upgini 1.1.264a1__tar.gz → 1.1.265__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (85)
  1. {upgini-1.1.264a1/src/upgini.egg-info → upgini-1.1.265}/PKG-INFO +1 -1
  2. {upgini-1.1.264a1 → upgini-1.1.265}/setup.py +1 -1
  3. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/features_enricher.py +6 -1
  4. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/resource_bundle/strings.properties +1 -0
  5. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/datetime_utils.py +49 -1
  6. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/target_utils.py +2 -6
  7. {upgini-1.1.264a1 → upgini-1.1.265/src/upgini.egg-info}/PKG-INFO +1 -1
  8. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_datetime_utils.py +30 -2
  9. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_features_enricher.py +2 -0
  10. {upgini-1.1.264a1 → upgini-1.1.265}/LICENSE +0 -0
  11. {upgini-1.1.264a1 → upgini-1.1.265}/README.md +0 -0
  12. {upgini-1.1.264a1 → upgini-1.1.265}/pyproject.toml +0 -0
  13. {upgini-1.1.264a1 → upgini-1.1.265}/setup.cfg +0 -0
  14. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/__init__.py +0 -0
  15. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/ads.py +0 -0
  16. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/ads_management/__init__.py +0 -0
  17. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/ads_management/ads_manager.py +0 -0
  18. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/__init__.py +0 -0
  19. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/all_operands.py +0 -0
  20. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/binary.py +0 -0
  21. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/date.py +0 -0
  22. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/feature.py +0 -0
  23. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/groupby.py +0 -0
  24. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/operand.py +0 -0
  25. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/unary.py +0 -0
  26. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/vector.py +0 -0
  27. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/data_source/__init__.py +0 -0
  28. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/data_source/data_source_publisher.py +0 -0
  29. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/dataset.py +0 -0
  30. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/errors.py +0 -0
  31. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/fingerprint.js +0 -0
  32. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/http.py +0 -0
  33. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/mdc/__init__.py +0 -0
  34. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/mdc/context.py +0 -0
  35. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/metadata.py +0 -0
  36. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/metrics.py +0 -0
  37. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/normalizer/__init__.py +0 -0
  38. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/normalizer/phone_normalizer.py +0 -0
  39. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/resource_bundle/__init__.py +0 -0
  40. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/resource_bundle/exceptions.py +0 -0
  41. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  42. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/sampler/__init__.py +0 -0
  43. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/sampler/base.py +0 -0
  44. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/sampler/random_under_sampler.py +0 -0
  45. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/sampler/utils.py +0 -0
  46. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/search_task.py +0 -0
  47. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/spinner.py +0 -0
  48. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/__init__.py +0 -0
  49. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/base_search_key_detector.py +0 -0
  50. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/blocked_time_series.py +0 -0
  51. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/country_utils.py +0 -0
  52. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/custom_loss_utils.py +0 -0
  53. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/cv_utils.py +0 -0
  54. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/deduplicate_utils.py +0 -0
  55. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/display_utils.py +0 -0
  56. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/email_utils.py +0 -0
  57. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/fallback_progress_bar.py +0 -0
  58. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/features_validator.py +0 -0
  59. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/format.py +0 -0
  60. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/ip_utils.py +0 -0
  61. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/phone_utils.py +0 -0
  62. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/postal_code_utils.py +0 -0
  63. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/progress_bar.py +0 -0
  64. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/sklearn_ext.py +0 -0
  65. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/version_validator.py +0 -0
  68. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini.egg-info/SOURCES.txt +0 -0
  69. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini.egg-info/dependency_links.txt +0 -0
  70. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini.egg-info/requires.txt +0 -0
  71. {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini.egg-info/top_level.txt +0 -0
  72. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_autofe_operands.py +0 -0
  73. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_binary_dataset.py +0 -0
  74. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_blocked_time_series.py +0 -0
  75. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_categorical_dataset.py +0 -0
  76. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_continuous_dataset.py +0 -0
  77. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_country_utils.py +0 -0
  78. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_custom_loss_utils.py +0 -0
  79. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_email_utils.py +0 -0
  80. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_etalon_validation.py +0 -0
  81. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_metrics.py +0 -0
  82. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_phone_utils.py +0 -0
  83. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_postal_code_utils.py +0 -0
  84. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_target_utils.py +0 -0
  85. {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.264a1
3
+ Version: 1.1.265
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.264a1"
43
+ version = "1.1.265"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -70,6 +70,7 @@ from upgini.utils.datetime_utils import (
70
70
  DateTimeSearchKeyConverter,
71
71
  is_blocked_time_series,
72
72
  is_time_series,
73
+ validate_dates_distribution,
73
74
  )
74
75
  from upgini.utils.deduplicate_utils import (
75
76
  clean_full_duplicates,
@@ -1922,7 +1923,7 @@ class FeaturesEnricher(TransformerMixin):
1922
1923
 
1923
1924
  meaning_types = {col: key.value for col, key in search_keys.items()}
1924
1925
  non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
1925
- # Don't pass
1926
+
1926
1927
  if email_converted_to_hem:
1927
1928
  non_keys_columns.append(email_column)
1928
1929
 
@@ -2221,6 +2222,10 @@ class FeaturesEnricher(TransformerMixin):
2221
2222
  self.fit_search_keys = self.search_keys.copy()
2222
2223
  self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
2223
2224
 
2225
+ validate_dates_distribution(
2226
+ validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
2227
+ )
2228
+
2224
2229
  has_date = self._get_date_column(self.fit_search_keys) is not None
2225
2230
  model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
2226
2231
  self._validate_binary_observations(validated_y, model_task_type)
@@ -111,6 +111,7 @@ x_is_empty=X is empty
111
111
  y_is_empty=y is empty
112
112
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
113
113
  missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
114
+ x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample.
114
115
  # eval set validation
115
116
  unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
116
117
  eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -1,7 +1,7 @@
1
1
  import datetime
2
2
  import logging
3
3
  import re
4
- from typing import List, Optional
4
+ from typing import Dict, List, Optional
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
@@ -9,7 +9,9 @@ from dateutil.relativedelta import relativedelta
9
9
  from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
10
10
 
11
11
  from upgini.errors import ValidationError
12
+ from upgini.metadata import SearchKey
12
13
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
14
+ from upgini.utils.warning_counter import WarningCounter
13
15
 
14
16
  DATE_FORMATS = [
15
17
  "%Y-%m-%d",
@@ -225,3 +227,49 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
225
227
 
226
228
  is_diff_less_than_two_columns = grouped.apply(check_differences)
227
229
  return is_diff_less_than_two_columns.all()
230
+
231
+
232
+ def validate_dates_distribution(
233
+ X: pd.DataFrame,
234
+ search_keys: Dict[str, SearchKey],
235
+ logger: Optional[logging.Logger] = None,
236
+ bundle: Optional[ResourceBundle] = None,
237
+ warning_counter: Optional[WarningCounter] = None,
238
+ ):
239
+ maybe_date_col = None
240
+ for key, key_type in search_keys.items():
241
+ if key_type in [SearchKey.DATE, SearchKey.DATETIME]:
242
+ maybe_date_col = key
243
+
244
+ if maybe_date_col is None:
245
+ for col in X.columns:
246
+ if col in search_keys:
247
+ continue
248
+ try:
249
+ pd.to_datetime(X[col])
250
+ maybe_date_col = col
251
+ break
252
+ except Exception:
253
+ pass
254
+
255
+ if maybe_date_col is None:
256
+ return
257
+
258
+ dates = pd.to_datetime(X[maybe_date_col]).dt.date
259
+
260
+ date_counts = dates.value_counts().sort_index()
261
+
262
+ date_counts_1 = date_counts[: round(len(date_counts) / 2)]
263
+ date_counts_2 = date_counts[round(len(date_counts) / 2) :]
264
+ ratio = date_counts_2.mean() / date_counts_1.mean()
265
+
266
+ if ratio > 1.2 or ratio < 0.8:
267
+ if warning_counter is not None:
268
+ warning_counter.increment()
269
+ if logger is None:
270
+ logger = logging.getLogger("muted_logger")
271
+ logger.setLevel("FATAL")
272
+ bundle = bundle or get_custom_bundle()
273
+ msg = bundle.get("x_unstable_by_date")
274
+ print(msg)
275
+ logger.warning(msg)
@@ -132,9 +132,7 @@ def balance_undersample(
132
132
  class_value = classes[class_idx]
133
133
  class_count = vc[class_value]
134
134
  sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
135
- sampler = RandomUnderSampler(
136
- sampling_strategy=sample_strategy, random_state=random_state
137
- )
135
+ sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
138
136
  X = df[SYSTEM_RECORD_ID]
139
137
  X = X.to_frame(SYSTEM_RECORD_ID)
140
138
  new_x, _ = sampler.fit_resample(X, target) # type: ignore
@@ -153,9 +151,7 @@ def balance_undersample(
153
151
  minority_class = df[df[target_column] == min_class_value]
154
152
  majority_class = df[df[target_column] != min_class_value]
155
153
  sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
156
- sampled_majority_class = majority_class.sample(
157
- n=sample_size, random_state=random_state
158
- )
154
+ sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
159
155
  resampled_data = df[
160
156
  (df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
161
157
  | (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.264a1
3
+ Version: 1.1.265
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -1,7 +1,13 @@
1
- import pandas as pd
2
1
  import numpy as np
2
+ import pandas as pd
3
3
 
4
- from upgini.utils.datetime_utils import is_blocked_time_series, is_time_series
4
+ from upgini.metadata import SearchKey
5
+ from upgini.utils.datetime_utils import (
6
+ is_blocked_time_series,
7
+ is_time_series,
8
+ validate_dates_distribution,
9
+ )
10
+ from upgini.utils.warning_counter import WarningCounter
5
11
 
6
12
  pd.set_option("mode.chained_assignment", "raise")
7
13
 
@@ -183,3 +189,25 @@ def test_multivariate_time_series():
183
189
  assert not is_blocked_time_series(df, "date", ["date"])
184
190
 
185
191
  assert is_blocked_time_series(df, "date", ["date", "feature3"])
192
+
193
+
194
+ def test_validate_dates_distribution():
195
+ df = pd.DataFrame({"date": ["2020-01-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40})
196
+ warning_counter = WarningCounter()
197
+ validate_dates_distribution(df, {}, warning_counter=warning_counter)
198
+ assert warning_counter.has_warnings()
199
+
200
+ df = pd.DataFrame({"date": ["2020-05-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40})
201
+ warning_counter = WarningCounter()
202
+ validate_dates_distribution(df, {}, warning_counter=warning_counter)
203
+ assert not warning_counter.has_warnings()
204
+
205
+ df = pd.DataFrame(
206
+ {
207
+ "date2": ["2020-05-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40,
208
+ "date1": ["2020-01-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40,
209
+ }
210
+ )
211
+ warning_counter = WarningCounter()
212
+ validate_dates_distribution(df, {"date1": SearchKey.DATE}, warning_counter=warning_counter)
213
+ assert warning_counter.has_warnings()
@@ -2164,6 +2164,8 @@ def test_idempotent_order_with_imbalanced_dataset(requests_mock: Mocker):
2164
2164
 
2165
2165
  actual_result_df = result_wrapper.df.sort_values(by="system_record_id").reset_index(drop=True)
2166
2166
  # actual_result_df.to_parquet(expected_result_path)
2167
+ actual_result_df["phone_num_a54a33"] = actual_result_df["phone_num_a54a33"].astype("Int64")
2168
+ actual_result_df["rep_date_f5d6bb"] = actual_result_df["rep_date_f5d6bb"].astype("Int64")
2167
2169
  assert_frame_equal(actual_result_df, expected_result_df)
2168
2170
 
2169
2171
  for i in range(5):
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes