upgini 1.1.296a3521.dev9__tar.gz → 1.1.297__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (65)
  1. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/PKG-INFO +1 -1
  2. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/pyproject.toml +1 -1
  3. upgini-1.1.297/src/upgini/__about__.py +1 -0
  4. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/autofe/all_operands.py +1 -9
  5. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/autofe/date.py +16 -46
  6. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/autofe/feature.py +6 -31
  7. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/data_source/data_source_publisher.py +37 -0
  8. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/features_enricher.py +1 -1
  9. upgini-1.1.296a3521.dev9/src/upgini/__about__.py +0 -1
  10. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/.gitignore +0 -0
  11. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/LICENSE +0 -0
  12. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/README.md +0 -0
  13. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/__init__.py +0 -0
  14. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/ads.py +0 -0
  15. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/autofe/binary.py +0 -0
  19. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/autofe/groupby.py +0 -0
  20. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/autofe/operand.py +0 -0
  21. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/autofe/unary.py +0 -0
  22. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/autofe/vector.py +0 -0
  23. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/data_source/__init__.py +0 -0
  24. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/dataset.py +0 -0
  25. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/errors.py +0 -0
  26. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/http.py +0 -0
  27. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/lazy_import.py +0 -0
  28. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/mdc/__init__.py +0 -0
  29. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/mdc/context.py +0 -0
  30. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/metadata.py +0 -0
  31. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/metrics.py +0 -0
  32. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/normalizer/__init__.py +0 -0
  33. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/normalizer/phone_normalizer.py +0 -0
  34. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/resource_bundle/__init__.py +0 -0
  35. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/resource_bundle/exceptions.py +0 -0
  36. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/resource_bundle/strings.properties +0 -0
  37. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  38. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/sampler/__init__.py +0 -0
  39. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/sampler/base.py +0 -0
  40. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/sampler/random_under_sampler.py +0 -0
  41. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/sampler/utils.py +0 -0
  42. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/search_task.py +0 -0
  43. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/spinner.py +0 -0
  44. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/__init__.py +0 -0
  45. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/base_search_key_detector.py +0 -0
  46. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/blocked_time_series.py +0 -0
  47. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/country_utils.py +0 -0
  48. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/custom_loss_utils.py +0 -0
  49. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/cv_utils.py +0 -0
  50. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/datetime_utils.py +0 -0
  51. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/deduplicate_utils.py +0 -0
  52. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/display_utils.py +0 -0
  53. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/email_utils.py +0 -0
  54. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/fallback_progress_bar.py +0 -0
  55. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/features_validator.py +0 -0
  56. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/format.py +0 -0
  57. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/ip_utils.py +0 -0
  58. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/phone_utils.py +0 -0
  59. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/postal_code_utils.py +0 -0
  60. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/progress_bar.py +0 -0
  61. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/sklearn_ext.py +0 -0
  62. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/target_utils.py +0 -0
  63. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.1.296a3521.dev9 → upgini-1.1.297}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.1.296a3521.dev9
3
+ Version: 1.1.297
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -47,8 +47,8 @@ dependencies = [
47
47
  "python-json-logger>=2.0.2",
48
48
  "requests>=2.8.0",
49
49
  "scikit-learn>=1.3.0",
50
- "xhtml2pdf==0.2.11",
51
50
  "python-bidi==0.4.2",
51
+ "xhtml2pdf==0.2.11",
52
52
  ]
53
53
 
54
54
  [project.urls]
@@ -0,0 +1 @@
1
+ __version__ = "1.1.297"
@@ -1,14 +1,7 @@
1
1
  from typing import Dict
2
2
 
3
3
  from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
4
- from upgini.autofe.date import (
5
- DateDiff,
6
- DateDiffType2,
7
- DateListDiff,
8
- DateListDiffBounded,
9
- DatePercentile,
10
- DatePercentileMethod2,
11
- )
4
+ from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded, DatePercentile
12
5
  from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
13
6
  from upgini.autofe.operand import Operand
14
7
  from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Norm, Sigmoid, Sqrt, Square
@@ -57,7 +50,6 @@ ALL_OPERANDS: Dict[str, Operand] = {
57
50
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
58
51
  DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
59
52
  DatePercentile(),
60
- DatePercentileMethod2(),
61
53
  Norm(),
62
54
  ]
63
55
  }
@@ -1,4 +1,3 @@
1
- import abc
2
1
  from typing import Any, Dict, List, Optional, Union
3
2
 
4
3
  import numpy as np
@@ -39,7 +38,6 @@ class DateDiffMixin(BaseModel):
39
38
 
40
39
  class DateDiff(PandasOperand, DateDiffMixin):
41
40
  name = "date_diff"
42
- alias = "date_diff_type1"
43
41
  is_binary = True
44
42
  has_symmetry_importance = True
45
43
 
@@ -161,45 +159,12 @@ class DateListDiffBounded(DateListDiff):
161
159
  return super()._agg(x)
162
160
 
163
161
 
164
- class DatePercentileBase(PandasOperand, abc.ABC):
162
+ class DatePercentile(PandasOperand):
163
+ name = "date_per"
165
164
  is_binary = True
166
165
  output_type = "float"
167
166
 
168
167
  date_unit: Optional[str] = None
169
-
170
- def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
171
- # Assuming that left is a date column, right is a feature column
172
- left = pd.to_datetime(left, unit=self.date_unit)
173
-
174
- bounds = self._get_bounds(left)
175
-
176
- return right.index.to_series().apply(lambda i: self._perc(right[i], bounds[i]))
177
-
178
- @abc.abstractmethod
179
- def _get_bounds(self, date_col: pd.Series) -> pd.Series:
180
- pass
181
-
182
- def _perc(self, f, bounds):
183
- hit = np.where(f >= bounds)[0]
184
- if hit.size > 0:
185
- return np.max(hit) + 1
186
- else:
187
- return np.nan
188
-
189
- def get_params(self) -> Dict[str, Optional[str]]:
190
- res = super().get_params()
191
- res.update(
192
- {
193
- "date_unit": self.date_unit,
194
- }
195
- )
196
- return res
197
-
198
-
199
- class DatePercentile(DatePercentileBase):
200
- name = "date_per"
201
- alias = "date_per_method1"
202
-
203
168
  zero_month: Optional[int]
204
169
  zero_year: Optional[int]
205
170
  zero_bounds: Optional[List[float]]
@@ -209,6 +174,7 @@ class DatePercentile(DatePercentileBase):
209
174
  res = super().get_params()
210
175
  res.update(
211
176
  {
177
+ "date_unit": self.date_unit,
212
178
  "zero_month": self.zero_month,
213
179
  "zero_year": self.zero_year,
214
180
  "zero_bounds": self.zero_bounds,
@@ -224,18 +190,22 @@ class DatePercentile(DatePercentileBase):
224
190
  elif isinstance(value, str):
225
191
  return value[1:-1].split(", ")
226
192
 
227
- def _get_bounds(self, date_col: pd.Series) -> pd.Series:
228
- months = date_col.dt.month
229
- years = date_col.dt.year
193
+ def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
194
+ # Assuming that left is a date column, right is a feature column
195
+ left = pd.to_datetime(left, unit=self.date_unit)
196
+ months = left.dt.month
197
+ years = left.dt.year
230
198
 
231
199
  month_diffs = 12 * (years - (self.zero_year or 0)) + (months - (self.zero_month or 0))
232
- return month_diffs.apply(
200
+ bounds = month_diffs.apply(
233
201
  lambda d: np.array(self.zero_bounds if self.zero_bounds is not None else []) + d * self.step
234
202
  )
235
203
 
204
+ return right.index.to_series().apply(lambda i: self.__perc(right[i], bounds[i]))
236
205
 
237
- class DatePercentileMethod2(DatePercentileBase):
238
- name = "date_per_method2"
239
-
240
- def _get_bounds(self, date_col: pd.Series) -> pd.Series:
241
- pass
206
+ def __perc(self, f, bounds):
207
+ hit = np.where(f >= bounds)[0]
208
+ if hit.size > 0:
209
+ return np.max(hit) + 1
210
+ else:
211
+ return np.nan
@@ -16,9 +16,6 @@ class Column:
16
16
  self.data = data
17
17
  self.calculate_all = calculate_all
18
18
 
19
- def get_name_component(self, **kwargs) -> str:
20
- return self.name
21
-
22
19
  def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
23
20
  return self.name
24
21
 
@@ -44,7 +41,7 @@ class Column:
44
41
  def get_column_nodes(self) -> List["Column"]:
45
42
  return [self]
46
43
 
47
- def get_columns(self, **kwargs) -> List[str]:
44
+ def get_columns(self) -> List[str]:
48
45
  return [self.name]
49
46
 
50
47
  def infer_type(self, data: pd.DataFrame) -> DtypeObj:
@@ -60,12 +57,6 @@ class Column:
60
57
  def to_pretty_formula(self) -> str:
61
58
  return self.to_formula()
62
59
 
63
- def __eq__(self, value: object) -> bool:
64
- if not isinstance(value, Column):
65
- return False
66
- else:
67
- return self.name == value.name and self.calculate_all == value.calculate_all
68
-
69
60
 
70
61
  class Feature:
71
62
  def __init__(
@@ -134,9 +125,6 @@ class Feature:
134
125
  for child in self.children:
135
126
  child.delete_data()
136
127
 
137
- def get_op_display_name(self) -> str:
138
- return self.op.alias or self.op.name.lower()
139
-
140
128
  def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
141
129
  if self.cached_display_name is not None and cache:
142
130
  return self.cached_display_name
@@ -144,11 +132,11 @@ class Feature:
144
132
  if self.alias:
145
133
  components = ["f_autofe", self.alias]
146
134
  elif shorten and not self.op.is_unary:
147
- components = ["f_autofe", self.get_op_display_name()]
135
+ components = ["f_autofe", self.op.alias or self.op.name.lower()]
148
136
  else:
149
- components = ["f_" + "_f_".join(self.get_columns())] + [
137
+ components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + [
150
138
  "autofe",
151
- self.get_op_display_name(),
139
+ self.op.alias or self.op.name.lower(),
152
140
  ]
153
141
  components.extend([str(self.display_index)] if self.display_index is not None else [])
154
142
  display_name = "_".join(components)
@@ -318,21 +306,8 @@ class FeatureGroup:
318
306
  main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
319
307
  if isinstance(self.op, PandasOperand):
320
308
  columns = self.get_columns()
321
- lower_order_children = [
322
- ch for f in self.children for ch in f.children if ch.get_display_name() != main_column
323
- ]
324
- lower_order_names = [ch.get_display_name() for ch in lower_order_children]
325
- if any(isinstance(f, Feature) for f in lower_order_children):
326
- child_data = pd.concat(
327
- [data[main_column]] + [ch.calculate(data) for ch in lower_order_children],
328
- axis=1,
329
- )
330
- child_data.columns = [main_column] + lower_order_names
331
- else:
332
- child_data = data[columns]
333
-
334
- new_data = self.op.calculate_group(child_data, main_column=main_column)
335
- new_data.rename(columns=dict(zip(lower_order_names, self.get_display_names())), inplace=True)
309
+ new_data = self.op.calculate_group(data[columns], main_column=main_column)
310
+ new_data.rename(columns=dict(zip(columns, self.get_display_names())), inplace=True)
336
311
  else:
337
312
  raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
338
313
 
@@ -59,9 +59,35 @@ class DataSourcePublisher:
59
59
  features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
60
60
  data_table_id_to_replace: Optional[str] = None,
61
61
  keep_features: Optional[List[str]] = None,
62
+ date_features: Optional[List[str]] = None,
63
+ date_vector_features: Optional[List[str]] = None,
62
64
  _force_generation=False,
63
65
  _silent=False,
64
66
  ) -> str:
67
+ """Register new ADS
68
+
69
+ Parameters
70
+ ----------
71
+ data_table_uri - str - table name in format {project_id}.{datasource_name}.{table_name}
72
+
73
+ search_keys - dict with column names as keys and SearchKey as value
74
+
75
+ update_frequency - str - (Monthly, Weekly, Daily, Annually, Quarterly)
76
+
77
+ exclude_from_autofe_generation - optional list of features that should be excluded from AutoFE
78
+
79
+ secondary_search_keys - optional dict of secondary search keys
80
+
81
+ sort_column - optional str - name of unique column that could be used for sort
82
+
83
+ date_format - optional str - format of date if it is present in search keys
84
+
85
+ ...
86
+
87
+ data_table_id_to_replace - optional str - id of registered ADS that should be replaced by new table
88
+
89
+ keep_features - optional list - features that should not be removed from ADS (even if they are personal)
90
+ """
65
91
  trace_id = str(uuid.uuid4())
66
92
 
67
93
  with MDC(trace_id=trace_id):
@@ -124,6 +150,14 @@ class DataSourcePublisher:
124
150
  request["excludeFromGeneration"] = exclude_from_autofe_generation
125
151
  if keep_features is not None:
126
152
  request["keepFeatures"] = keep_features
153
+ if date_features is not None:
154
+ if date_format is None:
155
+ raise ValidationError("date_format should be presented if you use date features")
156
+ request["dateFeatures"] = date_features
157
+ if date_vector_features is not None:
158
+ if date_format is None:
159
+ raise ValidationError("date_format should be presented if you use date vector features")
160
+ request["dateVectorFeatures"] = date_vector_features
127
161
  self.logger.info(f"Start registering data table {request}")
128
162
 
129
163
  task_id = self._rest_client.register_ads(request, trace_id)
@@ -181,6 +215,9 @@ class DataSourcePublisher:
181
215
  msg = f"Data table successfully registered with id: {data_table_id}"
182
216
  self.logger.info(msg)
183
217
  print(msg)
218
+ if "warnings" in status_response and status_response["warnings"]:
219
+ self.logger.warning(status_response["warnings"])
220
+ print(status_response["warnings"])
184
221
  return data_table_id
185
222
  except KeyboardInterrupt:
186
223
  if task_id is not None:
@@ -2870,7 +2870,7 @@ class FeaturesEnricher(TransformerMixin):
2870
2870
  self.logger.info(f"Dates interval is ({min_date}, {max_date})")
2871
2871
 
2872
2872
  except Exception:
2873
- self.logger.exception("Failed to log debug information")
2873
+ self.logger.warning("Failed to log debug information", exc_info=True)
2874
2874
 
2875
2875
  def __handle_index_search_keys(self, df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> pd.DataFrame:
2876
2876
  index_names = df.index.names if df.index.names != [None] else [DEFAULT_INDEX]
@@ -1 +0,0 @@
1
- __version__ = "1.1.296a3521.dev9"
File without changes
File without changes
File without changes