upgini 1.1.297__tar.gz → 1.1.298__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic; more details are linked from the original release page.

Files changed (65):
  1. {upgini-1.1.297 → upgini-1.1.298}/PKG-INFO +3 -3
  2. {upgini-1.1.297 → upgini-1.1.298}/README.md +2 -2
  3. upgini-1.1.298/src/upgini/__about__.py +1 -0
  4. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/autofe/all_operands.py +9 -1
  5. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/autofe/date.py +46 -16
  6. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/autofe/feature.py +27 -5
  7. upgini-1.1.297/src/upgini/__about__.py +0 -1
  8. {upgini-1.1.297 → upgini-1.1.298}/.gitignore +0 -0
  9. {upgini-1.1.297 → upgini-1.1.298}/LICENSE +0 -0
  10. {upgini-1.1.297 → upgini-1.1.298}/pyproject.toml +0 -0
  11. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/__init__.py +0 -0
  12. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/ads.py +0 -0
  13. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/ads_management/__init__.py +0 -0
  14. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/ads_management/ads_manager.py +0 -0
  15. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/autofe/__init__.py +0 -0
  16. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/autofe/binary.py +0 -0
  17. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/autofe/groupby.py +0 -0
  18. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/autofe/operand.py +0 -0
  19. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/autofe/unary.py +0 -0
  20. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/autofe/vector.py +0 -0
  21. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/data_source/__init__.py +0 -0
  22. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/data_source/data_source_publisher.py +0 -0
  23. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/dataset.py +0 -0
  24. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/errors.py +0 -0
  25. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/features_enricher.py +0 -0
  26. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/http.py +0 -0
  27. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/lazy_import.py +0 -0
  28. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/mdc/__init__.py +0 -0
  29. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/mdc/context.py +0 -0
  30. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/metadata.py +0 -0
  31. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/metrics.py +0 -0
  32. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/normalizer/__init__.py +0 -0
  33. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/normalizer/phone_normalizer.py +0 -0
  34. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/resource_bundle/__init__.py +0 -0
  35. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/resource_bundle/exceptions.py +0 -0
  36. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/resource_bundle/strings.properties +0 -0
  37. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  38. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/sampler/__init__.py +0 -0
  39. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/sampler/base.py +0 -0
  40. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/sampler/random_under_sampler.py +0 -0
  41. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/sampler/utils.py +0 -0
  42. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/search_task.py +0 -0
  43. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/spinner.py +0 -0
  44. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/__init__.py +0 -0
  45. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/base_search_key_detector.py +0 -0
  46. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/blocked_time_series.py +0 -0
  47. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/country_utils.py +0 -0
  48. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/custom_loss_utils.py +0 -0
  49. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/cv_utils.py +0 -0
  50. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/datetime_utils.py +0 -0
  51. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/deduplicate_utils.py +0 -0
  52. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/display_utils.py +0 -0
  53. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/email_utils.py +0 -0
  54. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/fallback_progress_bar.py +0 -0
  55. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/features_validator.py +0 -0
  56. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/format.py +0 -0
  57. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/ip_utils.py +0 -0
  58. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/phone_utils.py +0 -0
  59. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/postal_code_utils.py +0 -0
  60. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/progress_bar.py +0 -0
  61. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/sklearn_ext.py +0 -0
  62. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/target_utils.py +0 -0
  63. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.1.297 → upgini-1.1.298}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.297
3
+ Version: 1.1.298
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -131,7 +131,7 @@ Description-Content-Type: text/markdown
131
131
  |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
132
132
  |World economic indicators|191 |41|-|Monthly|date, country|No
133
133
  |Markets data|-|17|-|Monthly|date, datetime|No
134
- |World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
134
+ |World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
135
135
  |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
136
136
  |World house prices |44|-|3|Annual|country, postal/ZIP code|No
137
137
  |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -840,4 +840,4 @@ Some convenient ways to start contributing are:
840
840
  - [More perks for registered users](https://profile.upgini.com)
841
841
 
842
842
  <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
843
- Please report it here</a></sup>
843
+ Please report it here</a></sup>
@@ -90,7 +90,7 @@
90
90
  |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
91
91
  |World economic indicators|191 |41|-|Monthly|date, country|No
92
92
  |Markets data|-|17|-|Monthly|date, datetime|No
93
- |World mobile & fixed broadband network coverage and perfomance |167|-|3|Monthly|country, postal/ZIP code|No
93
+ |World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
94
94
  |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
95
95
  |World house prices |44|-|3|Annual|country, postal/ZIP code|No
96
96
  |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -799,4 +799,4 @@ Some convenient ways to start contributing are:
799
799
  - [More perks for registered users](https://profile.upgini.com)
800
800
 
801
801
  <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
802
- Please report it here</a></sup>
802
+ Please report it here</a></sup>
@@ -0,0 +1 @@
1
+ __version__ = "1.1.298"
@@ -1,7 +1,14 @@
1
1
  from typing import Dict
2
2
 
3
3
  from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
4
- from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
4
+ from upgini.autofe.date import (
5
+ DateDiff,
6
+ DateDiffType2,
7
+ DateListDiff,
8
+ DateListDiffBounded,
9
+ DatePercentile,
10
+ DatePercentileMethod2,
11
+ )
5
12
  from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
6
13
  from upgini.autofe.operand import Operand
7
14
  from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
@@ -50,6 +57,7 @@ ALL_OPERANDS: Dict[str, Operand] = {
50
57
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
51
58
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
52
59
  DatePercentile(),
60
+ DatePercentileMethod2(),
53
61
  Norm(),
54
62
  ]
55
63
  }
@@ -1,3 +1,4 @@
1
+ import abc
1
2
  from typing import Any, Dict, List, Optional, Union
2
3
 
3
4
  import numpy as np
@@ -38,6 +39,7 @@ class DateDiffMixin(BaseModel):
38
39
 
39
40
  class DateDiff(PandasOperand, DateDiffMixin):
40
41
  name = "date_diff"
42
+ alias = "date_diff_type1"
41
43
  is_binary = True
42
44
  has_symmetry_importance = True
43
45
 
@@ -159,12 +161,45 @@ class DateListDiffBounded(DateListDiff):
159
161
  return super()._agg(x)
160
162
 
161
163
 
162
- class DatePercentile(PandasOperand):
163
- name = "date_per"
164
+ class DatePercentileBase(PandasOperand, abc.ABC):
164
165
  is_binary = True
165
166
  output_type = "float"
166
167
 
167
168
  date_unit: Optional[str] = None
169
+
170
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
171
+ # Assuming that left is a date column, right is a feature column
172
+ left = pd.to_datetime(left, unit=self.date_unit)
173
+
174
+ bounds = self._get_bounds(left)
175
+
176
+ return right.index.to_series().apply(lambda i: self._perc(right[i], bounds[i]))
177
+
178
+ @abc.abstractmethod
179
+ def _get_bounds(self, date_col: pd.Series) -> pd.Series:
180
+ pass
181
+
182
+ def _perc(self, f, bounds):
183
+ hit = np.where(f >= bounds)[0]
184
+ if hit.size > 0:
185
+ return np.max(hit) + 1
186
+ else:
187
+ return np.nan
188
+
189
+ def get_params(self) -> Dict[str, Optional[str]]:
190
+ res = super().get_params()
191
+ res.update(
192
+ {
193
+ "date_unit": self.date_unit,
194
+ }
195
+ )
196
+ return res
197
+
198
+
199
+ class DatePercentile(DatePercentileBase):
200
+ name = "date_per"
201
+ alias = "date_per_method1"
202
+
168
203
  zero_month: Optional[int]
169
204
  zero_year: Optional[int]
170
205
  zero_bounds: Optional[List[float]]
@@ -174,7 +209,6 @@ class DatePercentile(PandasOperand):
174
209
  res = super().get_params()
175
210
  res.update(
176
211
  {
177
- "date_unit": self.date_unit,
178
212
  "zero_month": self.zero_month,
179
213
  "zero_year": self.zero_year,
180
214
  "zero_bounds": self.zero_bounds,
@@ -190,22 +224,18 @@ class DatePercentile(PandasOperand):
190
224
  elif isinstance(value, str):
191
225
  return value[1:-1].split(", ")
192
226
 
193
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
194
- # Assuming that left is a date column, right is a feature column
195
- left = pd.to_datetime(left, unit=self.date_unit)
196
- months = left.dt.month
197
- years = left.dt.year
227
+ def _get_bounds(self, date_col: pd.Series) -> pd.Series:
228
+ months = date_col.dt.month
229
+ years = date_col.dt.year
198
230
 
199
231
  month_diffs = 12 * (years - (self.zero_year or 0)) + (months - (self.zero_month or 0))
200
- bounds = month_diffs.apply(
232
+ return month_diffs.apply(
201
233
  lambda d: np.array(self.zero_bounds if self.zero_bounds is not None else []) + d * self.step
202
234
  )
203
235
 
204
- return right.index.to_series().apply(lambda i: self.__perc(right[i], bounds[i]))
205
236
 
206
- def __perc(self, f, bounds):
207
- hit = np.where(f >= bounds)[0]
208
- if hit.size > 0:
209
- return np.max(hit) + 1
210
- else:
211
- return np.nan
237
+ class DatePercentileMethod2(DatePercentileBase):
238
+ name = "date_per_method2"
239
+
240
+ def _get_bounds(self, date_col: pd.Series) -> pd.Series:
241
+ pass
@@ -41,7 +41,7 @@ class Column:
41
41
  def get_column_nodes(self) -> List["Column"]:
42
42
  return [self]
43
43
 
44
- def get_columns(self) -> List[str]:
44
+ def get_columns(self, **kwargs) -> List[str]:
45
45
  return [self.name]
46
46
 
47
47
  def infer_type(self, data: pd.DataFrame) -> DtypeObj:
@@ -57,6 +57,12 @@ class Column:
57
57
  def to_pretty_formula(self) -> str:
58
58
  return self.to_formula()
59
59
 
60
+ def __eq__(self, value: object) -> bool:
61
+ if not isinstance(value, Column):
62
+ return False
63
+ else:
64
+ return self.name == value.name and self.calculate_all == value.calculate_all
65
+
60
66
 
61
67
  class Feature:
62
68
  def __init__(
@@ -125,6 +131,9 @@ class Feature:
125
131
  for child in self.children:
126
132
  child.delete_data()
127
133
 
134
+ def get_op_display_name(self) -> str:
135
+ return self.op.alias or self.op.name.lower()
136
+
128
137
  def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
129
138
  if self.cached_display_name is not None and cache:
130
139
  return self.cached_display_name
@@ -132,11 +141,11 @@ class Feature:
132
141
  if self.alias:
133
142
  components = ["f_autofe", self.alias]
134
143
  elif shorten and not self.op.is_unary:
135
- components = ["f_autofe", self.op.alias or self.op.name.lower()]
144
+ components = ["f_autofe", self.get_op_display_name()]
136
145
  else:
137
146
  components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
138
147
  "autofe",
139
- self.op.alias or self.op.name.lower(),
148
+ self.get_op_display_name(),
140
149
  ]
141
150
  components.extend([str(self.display_index)] if self.display_index is not None else [])
142
151
  display_name = "_".join(components)
@@ -306,8 +315,21 @@ class FeatureGroup:
306
315
  main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
307
316
  if isinstance(self.op, PandasOperand):
308
317
  columns = self.get_columns()
309
- new_data = self.op.calculate_group(data[columns], main_column=main_column)
310
- new_data.rename(columns=dict(zip(columns, self.get_display_names())), inplace=True)
318
+ lower_order_children = [
319
+ ch for f in self.children for ch in f.children if ch.get_display_name() != main_column
320
+ ]
321
+ lower_order_names = [ch.get_display_name() for ch in lower_order_children]
322
+ if any(isinstance(f, Feature) for f in lower_order_children):
323
+ child_data = pd.concat(
324
+ [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
325
+ axis=1,
326
+ )
327
+ child_data.columns = [main_column] + lower_order_names
328
+ else:
329
+ child_data = data[columns]
330
+
331
+ new_data = self.op.calculate_group(child_data, main_column=main_column)
332
+ new_data.rename(columns=dict(zip(lower_order_names, self.get_display_names())), inplace=True)
311
333
  else:
312
334
  raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
313
335
 
@@ -1 +0,0 @@
1
- __version__ = "1.1.297"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes