upgini 1.1.280a3418.post10__tar.gz → 1.1.280.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (64)
  1. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/PKG-INFO +1 -1
  2. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/pyproject.toml +23 -10
  3. upgini-1.1.280.dev0/src/upgini/__about__.py +1 -0
  4. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/autofe/all_operands.py +1 -2
  5. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/autofe/date.py +2 -86
  6. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/autofe/feature.py +4 -16
  7. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/autofe/operand.py +2 -4
  8. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/features_enricher.py +8 -2
  9. upgini-1.1.280a3418.post10/src/upgini/__about__.py +0 -1
  10. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/.gitignore +0 -0
  11. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/LICENSE +0 -0
  12. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/README.md +0 -0
  13. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/__init__.py +0 -0
  14. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/ads.py +0 -0
  15. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/autofe/binary.py +0 -0
  19. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/autofe/groupby.py +0 -0
  20. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/autofe/unary.py +0 -0
  21. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/autofe/vector.py +0 -0
  22. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/data_source/__init__.py +0 -0
  23. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/data_source/data_source_publisher.py +0 -0
  24. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/dataset.py +0 -0
  25. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/errors.py +0 -0
  26. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/http.py +0 -0
  27. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/mdc/__init__.py +0 -0
  28. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/mdc/context.py +0 -0
  29. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/metadata.py +0 -0
  30. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/metrics.py +0 -0
  31. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/normalizer/__init__.py +0 -0
  32. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/normalizer/phone_normalizer.py +0 -0
  33. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/resource_bundle/__init__.py +0 -0
  34. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/resource_bundle/exceptions.py +0 -0
  35. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/resource_bundle/strings.properties +0 -0
  36. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  37. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/sampler/__init__.py +0 -0
  38. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/sampler/base.py +0 -0
  39. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/sampler/random_under_sampler.py +0 -0
  40. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/sampler/utils.py +0 -0
  41. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/search_task.py +0 -0
  42. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/spinner.py +0 -0
  43. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/__init__.py +0 -0
  44. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/base_search_key_detector.py +0 -0
  45. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/blocked_time_series.py +0 -0
  46. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/country_utils.py +0 -0
  47. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/custom_loss_utils.py +0 -0
  48. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/cv_utils.py +0 -0
  49. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/datetime_utils.py +0 -0
  50. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/deduplicate_utils.py +0 -0
  51. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/display_utils.py +0 -0
  52. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/email_utils.py +0 -0
  53. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/fallback_progress_bar.py +0 -0
  54. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/features_validator.py +0 -0
  55. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/format.py +0 -0
  56. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/ip_utils.py +0 -0
  57. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/phone_utils.py +0 -0
  58. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/postal_code_utils.py +0 -0
  59. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/progress_bar.py +0 -0
  60. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/sklearn_ext.py +0 -0
  61. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/target_utils.py +0 -0
  62. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/track_info.py +0 -0
  63. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/utils/warning_counter.py +0 -0
  64. {upgini-1.1.280a3418.post10 → upgini-1.1.280.dev0}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.280a3418.post10
3
+ Version: 1.1.280.dev0
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -72,23 +72,36 @@ packages = [
72
72
  type = "virtual"
73
73
  python = "3.10"
74
74
 
75
- [tool.hatch.envs.test]
76
- dependencies = [
77
- "coverage[toml]",
78
- "pytest",
79
- "pytest-cov",
80
- "requests-mock",
81
- ]
82
-
83
75
  [tool.hatch.envs.test.scripts]
84
- cov = 'pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=upgini --cov=tests {args}'
76
+ cov = 'pytest --cov-report=term-missing --cov-config=pyproject.toml --cov=upgini --cov=tests'
85
77
  format = "black {args}"
86
78
  lint = "ruff check {args}"
87
-
88
79
  test_binary = 'pytest -s -vv tests/test_binary_dataset.py'
89
80
 
81
+ [[tool.hatch.envs.test.matrix]]
82
+ python = ["3.8"]
83
+ pandas = ["1.1.0"]
84
+
90
85
  [[tool.hatch.envs.test.matrix]]
91
86
  python = ["3.8", "3.9", "3.10"]
87
+ pandas = ["1.2.0", "1.3.0", "1.4.0", "1.5.0", "2.0.0"]
88
+
89
+ [[tool.hatch.envs.test.matrix]]
90
+ python = ["3.9", "3.10"]
91
+ pandas = ["2.1.0", "2.2.0"]
92
+
93
+ # from versions: 0.1, 0.2, 0.3.0, 0.4.0, 0.4.1, 0.4.2, 0.4.3, 0.5.0, 0.6.0, 0.6.1, 0.7.0, 0.7.1, 0.7.2, 0.7.3, 0.8.0, 0.8.1, 0.9.0, 0.9.1, 0.10.0, 0.10.1, 0.11.0, 0.12.0, 0.13.0, 0.13.1, 0.14.0, 0.14.1, 0.15.0, 0.15.1, 0.15.2, 0.16.0, 0.16.1, 0.16.2, 0.17.0, 0.17.1, 0.18.0, 0.18.1, 0.19.0, 0.19.1, 0.19.2, 0.20.0, 0.20.1, 0.20.2, 0.20.3, 0.21.0, 0.21.1, 0.22.0, 0.23.0, 0.23.1, 0.23.2, 0.23.3, 0.23.4, 0.24.0, 0.24.1, 0.24.2, 0.25.0, 0.25.1, 0.25.2, 0.25.3, 1.0.0, 1.0.1, 1.0.2, 1.0.3, 1.0.4, 1.0.5, 1.1.0, 1.1.1, 1.1.2, 1.1.3, 1.1.4, 1.1.5, 1.2.0, 1.2.1, 1.2.2, 1.2.3, 1.2.4, 1.2.5, 1.3.0, 1.3.1, 1.3.2, 1.3.3, 1.3.4, 1.3.5, 1.4.0rc0, 1.4.0, 1.4.1, 1.4.2, 1.4.3, 1.4.4, 1.5.0rc0, 1.5.0, 1.5.1, 1.5.2, 1.5.3, 2.0.0rc0, 2.0.0rc1, 2.0.0, 2.0.1, 2.0.2, 2.0.3
94
+
95
+ [tool.hatch.envs.test]
96
+ dependencies = [
97
+ "coverage[toml]",
98
+ "pytest",
99
+ "pytest-cov",
100
+ # "pytest-timeout",
101
+ "requests-mock",
102
+ "pytest-datafiles",
103
+ "pandas~={matrix:pandas}.0",
104
+ ]
92
105
 
93
106
  [tool.black]
94
107
  line-length = 120
@@ -0,0 +1 @@
1
+ __version__ = "1.1.280.dev0"
@@ -1,7 +1,7 @@
1
1
  from typing import Dict
2
2
 
3
3
  from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
4
- from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
4
+ from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
5
5
  from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
6
6
  from upgini.autofe.operand import Operand
7
7
  from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Sigmoid, Sqrt, Square
@@ -49,7 +49,6 @@ ALL_OPERANDS: Dict[str, Operand] = {
49
49
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
50
50
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
51
51
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
52
- DatePercentile(),
53
52
  ]
54
53
  }
55
54
 
@@ -1,10 +1,9 @@
1
- from datetime import date
2
- from typing import Any, Dict, List, Optional, Union
1
+ from typing import Any, Optional, Union
3
2
 
4
3
  import numpy as np
5
4
  import pandas as pd
6
5
  from pandas.core.arrays.timedeltas import TimedeltaArray
7
- from pydantic import BaseModel, validator
6
+ from pydantic import BaseModel
8
7
 
9
8
  from upgini.autofe.operand import PandasOperand
10
9
 
@@ -28,17 +27,6 @@ class DateDiff(PandasOperand, DateDiffMixin):
28
27
  is_binary = True
29
28
  has_symmetry_importance = True
30
29
 
31
- def get_params(self) -> Dict[str, Optional[str]]:
32
- res = super().get_params()
33
- res.update(
34
- {
35
- "diff_unit": self.diff_unit,
36
- "left_unit": self.left_unit,
37
- "right_unit": self.right_unit,
38
- }
39
- )
40
- return res
41
-
42
30
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
43
31
  left = self._convert_to_date(left, self.left_unit)
44
32
  right = self._convert_to_date(right, self.right_unit)
@@ -54,17 +42,6 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
54
42
  is_binary = True
55
43
  has_symmetry_importance = True
56
44
 
57
- def get_params(self) -> Dict[str, Optional[str]]:
58
- res = super().get_params()
59
- res.update(
60
- {
61
- "diff_unit": self.diff_unit,
62
- "left_unit": self.left_unit,
63
- "right_unit": self.right_unit,
64
- }
65
- )
66
- return res
67
-
68
45
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
69
46
  left = self._convert_to_date(left, self.left_unit)
70
47
  right = self._convert_to_date(right, self.right_unit)
@@ -87,15 +64,6 @@ class DateListDiff(PandasOperand, DateDiffMixin):
87
64
  has_symmetry_importance = True
88
65
  aggregation: str
89
66
 
90
- def get_params(self) -> Dict[str, Optional[str]]:
91
- res = super().get_params()
92
- res.update(
93
- {
94
- "aggregation": self.aggregation,
95
- }
96
- )
97
- return res
98
-
99
67
  def __init__(self, **data: Any) -> None:
100
68
  if "name" not in data:
101
69
  data["name"] = f"date_diff_{data.get('aggregation')}"
@@ -148,55 +116,3 @@ class DateListDiffBounded(DateListDiff):
148
116
  def _agg(self, x):
149
117
  x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
150
118
  return super()._agg(x)
151
-
152
-
153
- class DatePercentile(PandasOperand):
154
- name = "date_per"
155
- is_binary = True
156
- output_type = "float"
157
-
158
- date_unit: Optional[str] = None
159
- zero_month: Optional[int]
160
- zero_year: Optional[int]
161
- zero_bounds: Optional[List[float]]
162
- step: int = 30
163
-
164
- def get_params(self) -> Dict[str, Optional[str]]:
165
- res = super().get_params()
166
- res.update(
167
- {
168
- "date_unit": self.date_unit,
169
- "zero_month": self.zero_month,
170
- "zero_year": self.zero_year,
171
- "zero_bounds": self.zero_bounds,
172
- "step": self.step,
173
- }
174
- )
175
- return res
176
-
177
- @validator("zero_bounds", pre=True)
178
- def validate_bounds(cls, value):
179
- if value is None or isinstance(value, list):
180
- return value
181
- elif isinstance(value, str):
182
- return value[1:-1].split(", ")
183
-
184
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
185
- # Assuming that left is a date column, right is a feature column
186
- left = pd.to_datetime(left, unit=self.date_unit)
187
- months = left.dt.month
188
- years = left.dt.year
189
-
190
- month_diffs = 12 * (years - (self.zero_year or 0)) + (months - (self.zero_month or 0))
191
- bounds = month_diffs.apply(
192
- lambda d: np.array(self.zero_bounds if self.zero_bounds is not None else []) + d * 30
193
- )
194
-
195
- return right.index.to_series().apply(lambda i: self.__perc(right[i], bounds[i]))
196
-
197
- def __perc(self, f, bounds):
198
- hit = np.where(f >= bounds)[0]
199
- if hit.size > 0:
200
- return np.max(hit) * 10
201
- else:
202
- return np.nan
@@ -16,12 +16,6 @@ class Column:
16
16
  self.data = data
17
17
  self.calculate_all = calculate_all
18
18
 
19
- def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
20
- return self.name
21
-
22
- def set_op_params(self, params: Dict[str, str]) -> "Column":
23
- return self
24
-
25
19
  def rename_columns(self, mapping: Dict[str, str]) -> "Column":
26
20
  self.name = self._unhash(mapping.get(self.name) or self.name)
27
21
  return self
@@ -75,20 +69,14 @@ class Feature:
75
69
  self.cached_display_name = cached_display_name
76
70
  self.alias = alias
77
71
 
78
- def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
79
- obj_dict = self.op.dict().copy()
80
- obj_dict.update(params or {})
81
- self.op = self.op.__class__.parse_obj(obj_dict)
72
+ def set_op_params(self, params: Dict[str, str]) -> "Feature":
82
73
  self.op.set_params(params)
83
-
84
- for child in self.children:
85
- child.set_op_params(params)
86
74
  return self
87
75
 
88
76
  def get_hash(self) -> str:
89
- return hashlib.sha256(
90
- "_".join([self.op.name] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
91
- ).hexdigest()[:8]
77
+ return hashlib.sha256("_".join([self.op.name] + [ch.name for ch in self.children]).encode("utf-8")).hexdigest()[
78
+ :8
79
+ ]
92
80
 
93
81
  def set_alias(self, alias: str) -> "Feature":
94
82
  self.alias = alias
@@ -25,10 +25,8 @@ class Operand(BaseModel):
25
25
  self.params = params
26
26
  return self
27
27
 
28
- def get_params(self) -> Dict[str, Optional[str]]:
29
- res = {"alias": self.alias}
30
- res.update(self.params or {})
31
- return res
28
+ def get_params(self) -> Dict[str, str]:
29
+ return self.params
32
30
 
33
31
 
34
32
  MAIN_COLUMN = "main_column"
@@ -1333,6 +1333,9 @@ class FeaturesEnricher(TransformerMixin):
1333
1333
  excluding_search_keys = list(search_keys.keys())
1334
1334
  if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
1335
1335
  excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
1336
+ meta = self._search_task.get_all_features_metadata_v2()
1337
+ zero_importance_client_features = [m for m in meta if m.source == "etalon" and m.shap_value == 0.0]
1338
+
1336
1339
  client_features = [
1337
1340
  c
1338
1341
  for c in X_sampled.columns.to_list()
@@ -1341,6 +1344,7 @@ class FeaturesEnricher(TransformerMixin):
1341
1344
  excluding_search_keys
1342
1345
  + list(self.fit_dropped_features)
1343
1346
  + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
1347
+ + zero_importance_client_features
1344
1348
  )
1345
1349
  ]
1346
1350
 
@@ -2845,8 +2849,10 @@ class FeaturesEnricher(TransformerMixin):
2845
2849
  maybe_date_col = self._get_date_column(self.search_keys)
2846
2850
  if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
2847
2851
  # TODO cast date column to single dtype
2848
- min_date = X[maybe_date_col].min()
2849
- max_date = X[maybe_date_col].max()
2852
+ date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
2853
+ converted_X = date_converter.convert(X)
2854
+ min_date = converted_X[maybe_date_col].min()
2855
+ max_date = converted_X[maybe_date_col].max()
2850
2856
  self.logger.info(f"Dates interval is ({min_date}, {max_date})")
2851
2857
 
2852
2858
  except Exception:
@@ -1 +0,0 @@
1
- __version__ = "1.1.280a3418-10"