upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (49)
  1. upgini/__about__.py +1 -0
  2. upgini/ads.py +6 -2
  3. upgini/ads_management/ads_manager.py +4 -2
  4. upgini/autofe/all_operands.py +16 -4
  5. upgini/autofe/binary.py +2 -1
  6. upgini/autofe/date.py +74 -7
  7. upgini/autofe/feature.py +1 -1
  8. upgini/autofe/groupby.py +3 -1
  9. upgini/autofe/operand.py +4 -3
  10. upgini/autofe/unary.py +20 -1
  11. upgini/autofe/vector.py +2 -0
  12. upgini/data_source/data_source_publisher.py +14 -4
  13. upgini/dataset.py +8 -7
  14. upgini/errors.py +1 -1
  15. upgini/features_enricher.py +156 -63
  16. upgini/http.py +11 -10
  17. upgini/mdc/__init__.py +1 -3
  18. upgini/mdc/context.py +4 -6
  19. upgini/metadata.py +3 -0
  20. upgini/metrics.py +160 -96
  21. upgini/normalizer/phone_normalizer.py +2 -2
  22. upgini/resource_bundle/__init__.py +5 -5
  23. upgini/resource_bundle/strings.properties +9 -4
  24. upgini/sampler/base.py +1 -4
  25. upgini/sampler/random_under_sampler.py +2 -5
  26. upgini/search_task.py +4 -4
  27. upgini/spinner.py +1 -1
  28. upgini/utils/__init__.py +3 -2
  29. upgini/utils/base_search_key_detector.py +2 -2
  30. upgini/utils/blocked_time_series.py +4 -2
  31. upgini/utils/country_utils.py +2 -2
  32. upgini/utils/custom_loss_utils.py +3 -2
  33. upgini/utils/cv_utils.py +2 -2
  34. upgini/utils/datetime_utils.py +75 -18
  35. upgini/utils/deduplicate_utils.py +61 -18
  36. upgini/utils/email_utils.py +3 -3
  37. upgini/utils/fallback_progress_bar.py +1 -1
  38. upgini/utils/features_validator.py +2 -1
  39. upgini/utils/progress_bar.py +1 -1
  40. upgini/utils/sklearn_ext.py +15 -15
  41. upgini/utils/target_utils.py +21 -7
  42. upgini/utils/track_info.py +27 -15
  43. upgini/version_validator.py +2 -2
  44. {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/METADATA +21 -23
  45. upgini-1.1.280a3418.post2.dist-info/RECORD +62 -0
  46. {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/WHEEL +1 -2
  47. upgini-1.1.262a3250.post4.dist-info/RECORD +0 -62
  48. upgini-1.1.262a3250.post4.dist-info/top_level.txt +0 -1
  49. {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info/licenses}/LICENSE +0 -0
upgini/__about__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "1.1.280a3418-2"
upgini/ads.py CHANGED
@@ -5,7 +5,7 @@ from typing import Dict, Optional
5
5
 
6
6
  import numpy as np
7
7
  import pandas as pd
8
- from pandas.api.types import is_string_dtype
8
+ from pandas.api.types import is_object_dtype, is_string_dtype
9
9
 
10
10
  from upgini import SearchKey
11
11
  from upgini.http import get_rest_client
@@ -34,7 +34,11 @@ def upload_user_ads(name: str, df: pd.DataFrame, search_keys: Dict[str, SearchKe
34
34
  if df[column_name].notnull().sum() < min_valid_rows_count:
35
35
  raise ValueError(bundle.get("ads_upload_to_many_empty_rows"))
36
36
  meaning_type = search_keys[column_name].value
37
- if meaning_type == FileColumnMeaningType.MSISDN and not is_string_dtype(df[column_name]):
37
+ if (
38
+ meaning_type == FileColumnMeaningType.MSISDN
39
+ and not is_string_dtype(df[column_name])
40
+ and not is_object_dtype(df[column_name])
41
+ ):
38
42
  df[column_name] = df[column_name].values.astype(np.int64).astype("string") # type: ignore
39
43
  else:
40
44
  meaning_type = FileColumnMeaningType.FEATURE
upgini/ads_management/ads_manager.py CHANGED
@@ -1,9 +1,11 @@
1
1
  import time
2
- from typing import Dict, Optional
3
2
  import uuid
3
+ from typing import Dict, Optional
4
+
5
+ import pandas as pd
6
+
4
7
  from upgini.http import get_rest_client
5
8
  from upgini.spinner import Spinner
6
- import pandas as pd
7
9
 
8
10
 
9
11
  class AdsManager:
upgini/autofe/all_operands.py CHANGED
@@ -1,9 +1,10 @@
1
1
  from typing import Dict
2
- from upgini.autofe.date import DateDiff, DateDiffFuture
2
+
3
+ from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
4
+ from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
3
5
  from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
4
6
  from upgini.autofe.operand import Operand
5
- from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
6
- from upgini.autofe.binary import Min, Max, Add, Subtract, Multiply, Divide, Sim
7
+ from upgini.autofe.unary import Abs, Bin, Floor, Freq, Log, Residual, Sigmoid, Sqrt, Square
7
8
  from upgini.autofe.vector import Mean, Sum
8
9
 
9
10
  ALL_OPERANDS: Dict[str, Operand] = {
@@ -37,7 +38,18 @@ ALL_OPERANDS: Dict[str, Operand] = {
37
38
  Operand(name="GroupByThenFreq", output_type="float", is_grouping=True),
38
39
  Sim(),
39
40
  DateDiff(),
40
- DateDiffFuture(),
41
+ DateDiffType2(),
42
+ DateListDiff(aggregation="min"),
43
+ DateListDiff(aggregation="max"),
44
+ DateListDiff(aggregation="mean"),
45
+ DateListDiff(aggregation="nunique"),
46
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
47
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
48
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
49
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
50
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
51
+ DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
52
+ Bin(),
41
53
  ]
42
54
  }
43
55
 
upgini/autofe/binary.py CHANGED
@@ -1,9 +1,10 @@
1
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
2
1
  import numpy as np
3
2
  import pandas as pd
4
3
  from numpy import dot
5
4
  from numpy.linalg import norm
6
5
 
6
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
7
+
7
8
 
8
9
  class Min(PandasOperand):
9
10
  name = "min"
upgini/autofe/date.py CHANGED
@@ -1,11 +1,14 @@
1
- from typing import Optional, Union
1
+ from typing import Any, Optional, Union
2
+
2
3
  import numpy as np
3
4
  import pandas as pd
5
+ from pandas.core.arrays.timedeltas import TimedeltaArray
6
+ from pydantic import BaseModel
4
7
 
5
8
  from upgini.autofe.operand import PandasOperand
6
9
 
7
10
 
8
- class DateDiffMixin:
11
+ class DateDiffMixin(BaseModel):
9
12
  diff_unit: str = "D"
10
13
  left_unit: Optional[str] = None
11
14
  right_unit: Optional[str] = None
@@ -34,18 +37,82 @@ class DateDiff(PandasOperand, DateDiffMixin):
34
37
  return x
35
38
 
36
39
 
37
- class DateDiffFuture(PandasOperand, DateDiffMixin):
38
- name = "date_diff_future"
40
+ class DateDiffType2(PandasOperand, DateDiffMixin):
41
+ name = "date_diff_type2"
39
42
  is_binary = True
40
43
  has_symmetry_importance = True
41
- is_vectorizable = False
42
44
 
43
45
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
44
46
  left = self._convert_to_date(left, self.left_unit)
45
47
  right = self._convert_to_date(right, self.right_unit)
46
- future = pd.to_datetime(dict(day=right.dt.day, month=right.dt.month, year=left.dt.year))
48
+ future = right + (left.dt.year - right.dt.year).apply(
49
+ lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
50
+ )
51
+ future = pd.to_datetime(future)
47
52
  before = future[future < left]
48
- future[future < left] = pd.to_datetime(dict(day=before.dt.day, month=before.dt.month, year=before.dt.year + 1))
53
+ future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
49
54
  diff = (future - left) / np.timedelta64(1, self.diff_unit)
50
55
 
51
56
  return diff
57
+
58
+
59
+ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
60
+
61
+
62
+ class DateListDiff(PandasOperand, DateDiffMixin):
63
+ is_binary = True
64
+ has_symmetry_importance = True
65
+ aggregation: str
66
+
67
+ def __init__(self, **data: Any) -> None:
68
+ if "name" not in data:
69
+ data["name"] = f"date_diff_{data.get('aggregation')}"
70
+ super().__init__(**data)
71
+
72
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
73
+ left = self._convert_to_date(left, self.left_unit)
74
+ right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
75
+
76
+ return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
77
+
78
+ def _diff(self, x: TimedeltaArray):
79
+ if self.diff_unit == "Y":
80
+ x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
81
+ elif self.diff_unit == "M":
82
+ raise Exception("Unsupported difference unit: Month")
83
+ else:
84
+ x = x / np.timedelta64(1, self.diff_unit)
85
+ return x[x > 0]
86
+
87
+ def _agg(self, x):
88
+ method = getattr(np, self.aggregation, None)
89
+ default = np.nan
90
+ if method is None and self.aggregation in _ext_aggregations:
91
+ method, default = _ext_aggregations[self.aggregation]
92
+ elif not callable(method):
93
+ raise ValueError(f"Unsupported aggregation: {self.aggregation}")
94
+
95
+ return method(x) if len(x) > 0 else default
96
+
97
+
98
+ class DateListDiffBounded(DateListDiff):
99
+ lower_bound: Optional[int]
100
+ upper_bound: Optional[int]
101
+
102
+ def __init__(self, **data: Any) -> None:
103
+ if "name" not in data:
104
+ lower_bound = data.get("lower_bound")
105
+ upper_bound = data.get("upper_bound")
106
+ components = [
107
+ "date_diff",
108
+ data.get("diff_unit"),
109
+ str(lower_bound if lower_bound is not None else "minusinf"),
110
+ str(upper_bound if upper_bound is not None else "plusinf"),
111
+ ]
112
+ components.append(data.get("aggregation"))
113
+ data["name"] = "_".join(components)
114
+ super().__init__(**data)
115
+
116
+ def _agg(self, x):
117
+ x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
118
+ return super()._agg(x)
upgini/autofe/feature.py CHANGED
@@ -215,7 +215,7 @@ class Feature:
215
215
  return Column(string)
216
216
 
217
217
  def is_trivial_char(c: str) -> bool:
218
- return not (c in "()+-*/,")
218
+ return c not in "()+-*/,"
219
219
 
220
220
  def find_prev(string: str) -> int:
221
221
  if string[-1] != ")":
upgini/autofe/groupby.py CHANGED
@@ -1,7 +1,9 @@
1
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
2
1
  from typing import Optional
2
+
3
3
  import pandas as pd
4
4
 
5
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
6
+
5
7
 
6
8
  class GroupByThenAgg(PandasOperand, VectorizableMixin):
7
9
  agg: Optional[str]
upgini/autofe/operand.py CHANGED
@@ -1,8 +1,9 @@
1
- from pydantic import BaseModel
2
- from typing import Dict, List, Optional, Tuple, Union
3
1
  import abc
4
- import pandas as pd
2
+ from typing import Dict, List, Optional, Tuple, Union
3
+
5
4
  import numpy as np
5
+ import pandas as pd
6
+ from pydantic import BaseModel
6
7
 
7
8
 
8
9
  class Operand(BaseModel):
upgini/autofe/unary.py CHANGED
@@ -1,7 +1,8 @@
1
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
2
1
  import numpy as np
3
2
  import pandas as pd
4
3
 
4
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
5
+
5
6
 
6
7
  class Abs(PandasOperand, VectorizableMixin):
7
8
  name = "abs"
@@ -110,3 +111,21 @@ class Freq(PandasOperand):
110
111
  def calculate_unary(self, data: pd.Series) -> pd.Series:
111
112
  value_counts = data.value_counts(normalize=True)
112
113
  return self._loc(data, value_counts)
114
+
115
+
116
+ class Bin(PandasOperand):
117
+ name = "bin"
118
+ is_unary = True
119
+ output_type = "int"
120
+ input_type = "discrete"
121
+
122
+ zero_bound_low: int
123
+ zero_bound_high: int
124
+ step: int
125
+
126
+ def calculate_unary(self, data: pd.Series) -> pd.Series:
127
+ res = pd.Series(np.zeros(data.shape), index=data.index, dtype="int")
128
+ res.update((data[data < self.zero_bound_low] - self.zero_bound_low) // self.step)
129
+ res.update((data[data >= self.zero_bound_high] - self.zero_bound_high) // self.step + 1)
130
+
131
+ return res
upgini/autofe/vector.py CHANGED
@@ -1,5 +1,7 @@
1
1
  from typing import List
2
+
2
3
  import pandas as pd
4
+
3
5
  from upgini.autofe.operand import PandasOperand, VectorizableMixin
4
6
 
5
7
 
upgini/data_source/data_source_publisher.py CHANGED
@@ -48,6 +48,7 @@ class DataSourcePublisher:
48
48
  data_table_uri: str,
49
49
  search_keys: Dict[str, SearchKey],
50
50
  update_frequency: str,
51
+ exclude_from_autofe_generation: Optional[List[str]],
51
52
  secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
52
53
  sort_column: Optional[str] = None,
53
54
  date_format: Optional[str] = None,
@@ -57,7 +58,6 @@ class DataSourcePublisher:
57
58
  join_date_abs_limit_days: Optional[int] = None,
58
59
  features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
59
60
  data_table_id_to_replace: Optional[str] = None,
60
- exclude_from_autofe_generation: Optional[List[str]] = None,
61
61
  _force_generation=False,
62
62
  _silent=False,
63
63
  ) -> str:
@@ -72,8 +72,8 @@ class DataSourcePublisher:
72
72
  )
73
73
  if search_keys is None or len(search_keys) == 0:
74
74
  raise ValidationError("Empty search keys")
75
- if SearchKey.DATE in search_keys.values() and date_format is None:
76
- raise ValidationError("date_format is required for DATE search key")
75
+ # if SearchKey.DATE in search_keys.values() and date_format is None:
76
+ # raise ValidationError("date_format is required for DATE search key")
77
77
  if update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
78
78
  raise ValidationError(
79
79
  f"Invalid update frequency: {update_frequency}. "
@@ -85,11 +85,19 @@ class DataSourcePublisher:
85
85
  or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
86
86
  ) and sort_column is None:
87
87
  raise ValidationError("Sort column is required for passed search keys")
88
+ if (
89
+ set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
90
+ and snapshot_frequency_days is None
91
+ and join_date_abs_limit_days is None
92
+ ):
93
+ raise ValidationError(
94
+ "With MSISDN and DATE keys one of the snapshot_frequency_days or"
95
+ " join_date_abs_limit_days parameters is required"
96
+ )
88
97
 
89
98
  request = {
90
99
  "dataTableUri": data_table_uri,
91
100
  "searchKeys": {k: v.value.value for k, v in search_keys.items()},
92
- "dateFormat": date_format,
93
101
  "excludeColumns": exclude_columns,
94
102
  "hashFeatureNames": str(hash_feature_names).lower(),
95
103
  "snapshotFrequencyDays": snapshot_frequency_days,
@@ -98,6 +106,8 @@ class DataSourcePublisher:
98
106
  "featuresForEmbeddings": features_for_embeddings,
99
107
  "forceGeneration": str(_force_generation).lower(),
100
108
  }
109
+ if date_format is not None:
110
+ request["dateFormat"] = date_format
101
111
  if secondary_search_keys is not None:
102
112
  request["secondarySearchKeys"] = {k: v.value.value for k, v in secondary_search_keys.items()}
103
113
  if sort_column is not None:
upgini/dataset.py CHANGED
@@ -15,6 +15,7 @@ from pandas.api.types import (
15
15
  is_float_dtype,
16
16
  is_integer_dtype,
17
17
  is_numeric_dtype,
18
+ is_object_dtype,
18
19
  is_period_dtype,
19
20
  is_string_dtype,
20
21
  )
@@ -60,7 +61,7 @@ class Dataset: # (pd.DataFrame):
60
61
  FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
61
62
  FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
62
63
  MIN_SAMPLE_THRESHOLD = 5_000
63
- IMBALANCE_THESHOLD = 0.4
64
+ IMBALANCE_THESHOLD = 0.6
64
65
  BINARY_BOOTSTRAP_LOOPS = 5
65
66
  MULTICLASS_BOOTSTRAP_LOOPS = 2
66
67
  MIN_TARGET_CLASS_ROWS = 100
@@ -94,7 +95,7 @@ class Dataset: # (pd.DataFrame):
94
95
  data = pd.read_csv(path, **kwargs)
95
96
  else:
96
97
  # try different separators: , ; \t ...
97
- with open(path, mode="r") as csvfile:
98
+ with open(path) as csvfile:
98
99
  sep = csv.Sniffer().sniff(csvfile.read(2048)).delimiter
99
100
  kwargs["sep"] = sep
100
101
  data = pd.read_csv(path, **kwargs)
@@ -219,7 +220,7 @@ class Dataset: # (pd.DataFrame):
219
220
  """Check that string values less than maximum characters for LLM"""
220
221
  # self.logger.info("Validate too long string values")
221
222
  for col in self.data.columns:
222
- if is_string_dtype(self.data[col]):
223
+ if is_string_dtype(self.data[col]) or is_object_dtype(self.data[col]):
223
224
  max_length: int = self.data[col].astype("str").str.len().max()
224
225
  if max_length > self.MAX_STRING_FEATURE_LENGTH:
225
226
  self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
@@ -250,7 +251,7 @@ class Dataset: # (pd.DataFrame):
250
251
  @staticmethod
251
252
  def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
252
253
  try:
253
- if isinstance(ip, IPv4Address) or isinstance(ip, IPv6Address):
254
+ if isinstance(ip, (IPv4Address, IPv6Address)):
254
255
  return int(ip)
255
256
  except Exception:
256
257
  pass
@@ -258,7 +259,7 @@ class Dataset: # (pd.DataFrame):
258
259
  @staticmethod
259
260
  def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
260
261
  try:
261
- if isinstance(ip, IPv4Address) or isinstance(ip, IPv6Address):
262
+ if isinstance(ip, (IPv4Address, IPv6Address)):
262
263
  return str(int(ip))
263
264
  except Exception:
264
265
  pass
@@ -350,7 +351,7 @@ class Dataset: # (pd.DataFrame):
350
351
  if postal_code is not None and postal_code in self.data.columns:
351
352
  # self.logger.info("Normalize postal code")
352
353
 
353
- if is_string_dtype(self.data[postal_code]):
354
+ if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
354
355
  try:
355
356
  self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
356
357
  except Exception:
@@ -821,7 +822,7 @@ class Dataset: # (pd.DataFrame):
821
822
  return DataType.INT
822
823
  elif is_float_dtype(pandas_data_type):
823
824
  return DataType.DECIMAL
824
- elif is_string_dtype(pandas_data_type):
825
+ elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
825
826
  return DataType.STRING
826
827
  else:
827
828
  msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
upgini/errors.py CHANGED
@@ -16,7 +16,7 @@ class UnauthorizedError(HttpError):
16
16
  """Unauthorized error from REST API."""
17
17
 
18
18
  def __init__(self, message, status_code):
19
- message = "Unauthorized, please check your authorization token ({})".format(message)
19
+ message = f"Unauthorized, please check your authorization token ({message})"
20
20
  super(UnauthorizedError, self).__init__(message, status_code)
21
21
 
22
22