upgini 1.1.274a1__tar.gz → 1.1.274a3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (85)
  1. {upgini-1.1.274a1/src/upgini.egg-info → upgini-1.1.274a3}/PKG-INFO +1 -1
  2. {upgini-1.1.274a1 → upgini-1.1.274a3}/setup.py +1 -1
  3. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/autofe/date.py +1 -0
  4. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/data_source/data_source_publisher.py +1 -1
  5. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/features_enricher.py +22 -8
  6. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/metrics.py +1 -0
  7. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/normalizer/phone_normalizer.py +2 -2
  8. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/datetime_utils.py +3 -0
  9. {upgini-1.1.274a1 → upgini-1.1.274a3/src/upgini.egg-info}/PKG-INFO +1 -1
  10. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_autofe_operands.py +2 -1
  11. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_features_enricher.py +5 -6
  12. {upgini-1.1.274a1 → upgini-1.1.274a3}/LICENSE +0 -0
  13. {upgini-1.1.274a1 → upgini-1.1.274a3}/README.md +0 -0
  14. {upgini-1.1.274a1 → upgini-1.1.274a3}/pyproject.toml +0 -0
  15. {upgini-1.1.274a1 → upgini-1.1.274a3}/setup.cfg +0 -0
  16. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/__init__.py +0 -0
  17. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/ads.py +0 -0
  18. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/ads_management/__init__.py +0 -0
  19. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/ads_management/ads_manager.py +0 -0
  20. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/autofe/__init__.py +0 -0
  21. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/autofe/all_operands.py +0 -0
  22. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/autofe/binary.py +0 -0
  23. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/autofe/feature.py +0 -0
  24. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/autofe/groupby.py +0 -0
  25. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/autofe/operand.py +0 -0
  26. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/autofe/unary.py +0 -0
  27. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/autofe/vector.py +0 -0
  28. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/data_source/__init__.py +0 -0
  29. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/dataset.py +0 -0
  30. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/errors.py +0 -0
  31. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/fingerprint.js +0 -0
  32. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/http.py +0 -0
  33. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/mdc/__init__.py +0 -0
  34. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/mdc/context.py +0 -0
  35. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/metadata.py +0 -0
  36. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/normalizer/__init__.py +0 -0
  37. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/resource_bundle/__init__.py +0 -0
  38. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/resource_bundle/exceptions.py +0 -0
  39. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/resource_bundle/strings.properties +0 -0
  40. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  41. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/sampler/__init__.py +0 -0
  42. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/sampler/base.py +0 -0
  43. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/sampler/random_under_sampler.py +0 -0
  44. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/sampler/utils.py +0 -0
  45. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/search_task.py +0 -0
  46. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/spinner.py +0 -0
  47. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/__init__.py +0 -0
  48. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/base_search_key_detector.py +0 -0
  49. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/blocked_time_series.py +0 -0
  50. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/country_utils.py +0 -0
  51. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/custom_loss_utils.py +0 -0
  52. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/cv_utils.py +0 -0
  53. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/email_utils.py +0 -0
  56. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/features_validator.py +0 -0
  58. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini/version_validator.py +0 -0
  68. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini.egg-info/SOURCES.txt +0 -0
  69. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini.egg-info/dependency_links.txt +0 -0
  70. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini.egg-info/requires.txt +0 -0
  71. {upgini-1.1.274a1 → upgini-1.1.274a3}/src/upgini.egg-info/top_level.txt +0 -0
  72. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_binary_dataset.py +0 -0
  73. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_blocked_time_series.py +0 -0
  74. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_categorical_dataset.py +0 -0
  75. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_continuous_dataset.py +0 -0
  76. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_country_utils.py +0 -0
  77. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_custom_loss_utils.py +0 -0
  78. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_datetime_utils.py +0 -0
  79. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_email_utils.py +0 -0
  80. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_etalon_validation.py +0 -0
  81. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_metrics.py +0 -0
  82. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_phone_utils.py +0 -0
  83. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_postal_code_utils.py +0 -0
  84. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_target_utils.py +0 -0
  85. {upgini-1.1.274a1 → upgini-1.1.274a3}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.274a1
3
+ Version: 1.1.274a3
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.274a1"
43
+ version = "1.1.274a3"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -46,6 +46,7 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
46
46
  future = right + (left.dt.year - right.dt.year).apply(
47
47
  lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
48
48
  )
49
+ future = pd.to_datetime(future)
49
50
  before = future[future < left]
50
51
  future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
51
52
  diff = (future - left) / np.timedelta64(1, self.diff_unit)
@@ -48,6 +48,7 @@ class DataSourcePublisher:
48
48
  data_table_uri: str,
49
49
  search_keys: Dict[str, SearchKey],
50
50
  update_frequency: str,
51
+ exclude_from_autofe_generation: Optional[List[str]],
51
52
  secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
52
53
  sort_column: Optional[str] = None,
53
54
  date_format: Optional[str] = None,
@@ -57,7 +58,6 @@ class DataSourcePublisher:
57
58
  join_date_abs_limit_days: Optional[int] = None,
58
59
  features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
59
60
  data_table_id_to_replace: Optional[str] = None,
60
- exclude_from_autofe_generation: Optional[List[str]] = None,
61
61
  _force_generation=False,
62
62
  _silent=False,
63
63
  ) -> str:
@@ -1,4 +1,5 @@
1
1
  import dataclasses
2
+ import datetime
2
3
  import gc
3
4
  import hashlib
4
5
  import itertools
@@ -146,6 +147,7 @@ class FeaturesEnricher(TransformerMixin):
146
147
  """
147
148
 
148
149
  TARGET_NAME = "target"
150
+ CURRENT_DATE = "current_date"
149
151
  RANDOM_STATE = 42
150
152
  CALCULATE_METRICS_THRESHOLD = 50_000_000
151
153
  CALCULATE_METRICS_MIN_THRESHOLD = 500
@@ -207,6 +209,7 @@ class FeaturesEnricher(TransformerMixin):
207
209
  client_ip: Optional[str] = None,
208
210
  client_visitorid: Optional[str] = None,
209
211
  custom_bundle_config: Optional[str] = None,
212
+ add_date_if_missing: bool = True,
210
213
  **kwargs,
211
214
  ):
212
215
  self.bundle = get_custom_bundle(custom_bundle_config)
@@ -317,6 +320,7 @@ class FeaturesEnricher(TransformerMixin):
317
320
  self.raise_validation_error = raise_validation_error
318
321
  self.exclude_columns = exclude_columns
319
322
  self.baseline_score_column = baseline_score_column
323
+ self.add_date_if_missing = add_date_if_missing
320
324
 
321
325
  def _get_api_key(self):
322
326
  return self._api_key
@@ -1804,10 +1808,13 @@ class FeaturesEnricher(TransformerMixin):
1804
1808
  else:
1805
1809
  features_section = ""
1806
1810
 
1807
- api_example = f"""curl 'https://inference-upgini.azurewebsites.net/api/http_inference_trigger' \\
1811
+ search_id = self._search_task.search_task_id
1812
+ api_example = (
1813
+ f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
1808
1814
  -H 'Authorization: {self.api_key}' \\
1809
1815
  -H 'Content-Type: application/json' \\
1810
- -d '{{"search_id": "{self._search_task.search_task_id}", "search_keys": {keys}{features_section}}}'"""
1816
+ -d '{{"search_keys": {keys}{features_section}}}'"""
1817
+ )
1811
1818
  return api_example
1812
1819
 
1813
1820
  def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
@@ -1902,6 +1909,9 @@ class FeaturesEnricher(TransformerMixin):
1902
1909
  generated_features.extend(converter.generated_features)
1903
1910
  else:
1904
1911
  self.logger.info("Input dataset hasn't date column")
1912
+ if self.add_date_if_missing:
1913
+ df = self._add_current_date_as_key(df)
1914
+ search_keys[self.CURRENT_DATE] = SearchKey.DATE
1905
1915
  email_column = self._get_email_column(search_keys)
1906
1916
  hem_column = self._get_hem_column(search_keys)
1907
1917
  email_converted_to_hem = False
@@ -2220,9 +2230,7 @@ class FeaturesEnricher(TransformerMixin):
2220
2230
  self.fit_search_keys = self.search_keys.copy()
2221
2231
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2222
2232
 
2223
- validate_dates_distribution(
2224
- validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
2225
- )
2233
+ validate_dates_distribution(validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
2226
2234
 
2227
2235
  maybe_date_column = self._get_date_column(self.fit_search_keys)
2228
2236
  has_date = maybe_date_column is not None
@@ -2273,6 +2281,9 @@ class FeaturesEnricher(TransformerMixin):
2273
2281
  self.fit_generated_features.extend(converter.generated_features)
2274
2282
  else:
2275
2283
  self.logger.info("Input dataset hasn't date column")
2284
+ if self.add_date_if_missing:
2285
+ df = self._add_current_date_as_key(df)
2286
+ self.fit_search_keys[self.CURRENT_DATE] = SearchKey.DATE
2276
2287
  email_column = self._get_email_column(self.fit_search_keys)
2277
2288
  hem_column = self._get_hem_column(self.fit_search_keys)
2278
2289
  email_converted_to_hem = False
@@ -2853,6 +2864,11 @@ class FeaturesEnricher(TransformerMixin):
2853
2864
  if t in [SearchKey.DATE, SearchKey.DATETIME]:
2854
2865
  return col
2855
2866
 
2867
+ @staticmethod
2868
+ def _add_current_date_as_key(df: pd.DataFrame) -> pd.DataFrame:
2869
+ df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
2870
+ return df
2871
+
2856
2872
  @staticmethod
2857
2873
  def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
2858
2874
  return [
@@ -2903,9 +2919,7 @@ class FeaturesEnricher(TransformerMixin):
2903
2919
  [
2904
2920
  c
2905
2921
  for c in df.columns
2906
- if c not in sort_columns
2907
- and c not in sort_exclude_columns
2908
- and df[c].nunique() > 1
2922
+ if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
2909
2923
  ]
2910
2924
  # [
2911
2925
  # sk
@@ -645,6 +645,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
645
645
 
646
646
 
647
647
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
648
+ # TODO validate that if it is Callable then it accepts 3 arguments
648
649
  if isinstance(scoring, str) and scoring is not None:
649
650
  _get_scorer_by_name(scoring)
650
651
 
@@ -1,7 +1,7 @@
1
1
  from typing import Optional
2
2
 
3
3
  import pandas as pd
4
- from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
4
+ from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype, is_object_dtype
5
5
 
6
6
  from upgini.errors import ValidationError
7
7
 
@@ -44,7 +44,7 @@ class PhoneNormalizer:
44
44
  Method will remove all non numeric chars from string and convert it to int.
45
45
  None will be set for phone numbers that couldn"t be converted to int
46
46
  """
47
- if is_string_dtype(self.df[self.phone_column_name]):
47
+ if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
48
48
  convert_func = self.phone_str_to_int_safe
49
49
  elif is_float_dtype(self.df[self.phone_column_name]):
50
50
  convert_func = self.phone_float_to_int_safe
@@ -100,6 +100,9 @@ class DateTimeSearchKeyConverter:
100
100
  msg = self.bundle.get("unsupported_date_type").format(self.date_column)
101
101
  self.logger.warning(msg)
102
102
  raise ValidationError(msg)
103
+ else:
104
+ df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
105
+ df[self.date_column] = self.parse_date(df)
103
106
 
104
107
  # If column with date is datetime then extract seconds of the day and minute of the hour
105
108
  # as additional features
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.274a1
3
+ Version: 1.1.274a3
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -30,7 +30,8 @@ def test_date_diff_type2():
30
30
 
31
31
  operand = DateDiffType2(left_unit="s")
32
32
  expected_result = pd.Series([61.0, 182.0])
33
- assert_series_equal(operand.calculate_binary(df.date1, df.date2), expected_result)
33
+ actual = operand.calculate_binary(df.date1, df.date2)
34
+ assert_series_equal(actual, expected_result)
34
35
 
35
36
 
36
37
  def test_date_diff_list():
@@ -246,7 +246,7 @@ def test_eval_set_with_diff_order_of_columns(requests_mock: Mocker):
246
246
  eval1_df = df[10000:11000].reset_index(drop=True)
247
247
  eval1_features = eval1_df.drop(columns="target")
248
248
  # shuffle columns
249
- eval1_features = eval1_features[set(eval1_features.columns)]
249
+ eval1_features = eval1_features[list(eval1_features.columns)]
250
250
  eval1_target = eval1_df["target"].reset_index(drop=True)
251
251
 
252
252
  eval2_df = df[11000:12000]
@@ -375,7 +375,7 @@ def test_saved_features_enricher(requests_mock: Mocker):
375
375
  url = "http://fake_url2"
376
376
 
377
377
  path_to_mock_features = os.path.join(
378
- os.path.dirname(os.path.realpath(__file__)), "test_data/binary/validation_features.parquet"
378
+ os.path.dirname(os.path.realpath(__file__)), "test_data/binary/validation_features_v2.parquet"
379
379
  )
380
380
 
381
381
  mock_default_requests(requests_mock, url)
@@ -462,7 +462,7 @@ def test_saved_features_enricher(requests_mock: Mocker):
462
462
  segment_header: [train_segment, eval_1_segment, eval_2_segment],
463
463
  rows_header: [10000, 1000, 1000],
464
464
  target_mean_header: [0.5044, 0.487, 0.486],
465
- enriched_gini: [-0.000136, 0.000000, -0.003728],
465
+ enriched_gini: [0.021830, -0.006607, -0.018483],
466
466
  }
467
467
  )
468
468
  print("Expected metrics: ")
@@ -496,7 +496,7 @@ def test_saved_features_enricher(requests_mock: Mocker):
496
496
  segment_header: [train_segment],
497
497
  rows_header: [10000],
498
498
  target_mean_header: [0.049],
499
- enriched_gini: [0.000985],
499
+ enriched_gini: [0.054454],
500
500
  }
501
501
  )
502
502
  print("Expected metrics: ")
@@ -2652,5 +2652,4 @@ class DataFrameWrapper:
2652
2652
 
2653
2653
 
2654
2654
  class TestException(Exception):
2655
- def __init__(self):
2656
- super().__init__()
2655
+ pass
File without changes
File without changes
File without changes
File without changes
File without changes