upgini 1.1.231a2__py3-none-any.whl → 1.1.232a2__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
upgini/autofe/feature.py CHANGED
@@ -1,78 +1,61 @@
1
1
  import hashlib
2
- from typing import Dict
2
+ import itertools
3
+ from typing import Dict, List, Optional, Tuple, Union
4
+
3
5
  import numpy as np
4
6
  import pandas as pd
5
- import itertools
6
- from upgini.autofe.operand import PandasOperand
7
- from upgini.autofe.all_operands import (
8
- find_op,
9
- )
7
+ from pandas._typing import DtypeObj
10
8
 
9
+ from upgini.autofe.all_operands import find_op
10
+ from upgini.autofe.operand import Operand, PandasOperand
11
11
 
12
- class FeatureGroup(object):
13
- def __init__(self, op, main_column, children):
14
- self.op = op
15
- self.main_column_node = main_column
16
- self.children = children
17
- self.data = None
18
-
19
- def get_columns(self, **kwargs):
20
- column_list = []
21
- seen = set()
22
- for child in self.children:
23
- columns = child.get_columns(**kwargs)
24
- column_list.extend([f for f in columns if f not in seen])
25
- seen.update(columns)
26
- return column_list
27
12
 
28
- def get_display_names(self, **kwargs):
29
- names = [f.get_display_name(**kwargs) for f in self.children]
30
- return names
13
+ class Column:
14
+ def __init__(self, name: str, data: Optional[pd.Series] = None, calculate_all=False):
15
+ self.name = name
16
+ self.data = data
17
+ self.calculate_all = calculate_all
31
18
 
32
- def calculate(self, data: pd.DataFrame, is_root=False):
33
- main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
34
- if isinstance(self.op, PandasOperand):
35
- columns = self.get_columns()
36
- new_data = self.op.calculate_group(data[columns], main_column=main_column)
37
- new_data.rename(columns=dict(zip(columns, self.get_display_names())), inplace=True)
19
+ def rename_columns(self, mapping: Dict[str, str]) -> "Column":
20
+ self.name = self._unhash(mapping.get(self.name) or self.name)
21
+ return self
38
22
 
23
+ def _unhash(self, feature_name: str) -> str:
24
+ last_component_idx = feature_name.rfind("_")
25
+ if not feature_name.startswith("f_"):
26
+ return feature_name # etalon feature
27
+ elif last_component_idx == 1:
28
+ return feature_name[2:] # fully hashed name, cannot unhash
39
29
  else:
40
- raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
30
+ return feature_name[2:last_component_idx]
41
31
 
42
- new_data.replace([-np.inf, np.inf], np.nan, inplace=True)
32
+ def delete_data(self):
33
+ self.data = None
43
34
 
44
- if is_root:
45
- self.data = new_data
46
- return new_data
35
+ def get_column_nodes(self) -> List["Column"]:
36
+ return [self]
47
37
 
48
- @staticmethod
49
- def make_groups(candidates):
50
- grouped_features = []
51
- for op_child, features in itertools.groupby(
52
- candidates, lambda f: (f.op, f.children[0] if f.op.is_unary or f.op.is_vector else f.children[1])
53
- ):
54
- op, main_child = op_child
55
- feature_list = list(features)
56
- if op.is_vectorizable:
57
- if op.is_unary:
58
- group = FeatureGroup(op, main_column=None, children=feature_list)
59
- else:
60
- group = FeatureGroup(op, main_column=main_child, children=feature_list)
61
- grouped_features.append(group)
62
- else:
63
- grouped_features.extend(feature_list)
64
- return grouped_features
38
+ def get_columns(self) -> List[str]:
39
+ return [self.name]
65
40
 
66
- def delete_data(self):
67
- self.data = None
68
- if self.main_column_node:
69
- self.main_column_node.delete_data()
70
- for child in self.children:
71
- child.delete_data()
41
+ def infer_type(self, data: pd.DataFrame) -> DtypeObj:
42
+ return data[self.name].dtype
72
43
 
44
+ def calculate(self, data: pd.DataFrame) -> pd.Series:
45
+ self.data = data[self.name]
46
+ return self.data
47
+
48
+ def to_formula(self, **kwargs) -> str:
49
+ return str(self.get_columns(**kwargs)[0])
73
50
 
74
- class Feature(object):
75
- def __init__(self, op, children, data=None, display_index=None, cached_display_name=None, alias=None):
51
+ def to_pretty_formula(self) -> str:
52
+ return self.to_formula()
53
+
54
+
55
+ class Feature:
56
+ def __init__(self, op: Operand, children: List[Union[Column, "Feature"]], data: Optional[pd.DataFrame] = None,
57
+ display_index: Optional[str] = None, cached_display_name: Optional[str] = None,
58
+ alias: Optional[str] = None):
76
59
  self.op = op
77
60
  self.children = children
78
61
  self.data = data
@@ -80,32 +63,32 @@ class Feature(object):
80
63
  self.cached_display_name = cached_display_name
81
64
  self.alias = alias
82
65
 
83
- def set_op_params(self, params: Dict):
66
+ def set_op_params(self, params: Dict[str, str]) -> "Feature":
84
67
  self.op.set_params(params)
85
68
  return self
86
69
 
87
- def get_hash(self):
70
+ def get_hash(self) -> str:
88
71
  return hashlib.sha256("_".join([self.op.name] + [ch.name for ch in self.children]).encode("utf-8")).hexdigest()[
89
72
  :8
90
73
  ]
91
74
 
92
- def set_alias(self, alias):
75
+ def set_alias(self, alias: str) -> "Feature":
93
76
  self.alias = alias
94
77
  return self
95
78
 
96
- def rename_columns(self, mapping: Dict):
79
+ def rename_columns(self, mapping: Dict[str, str]) -> "Feature":
97
80
  for child in self.children:
98
81
  child.rename_columns(mapping)
99
82
  self.cached_display_name = None
100
83
  return self
101
84
 
102
- def get_column_nodes(self):
85
+ def get_column_nodes(self) -> List[Union[Column, "Feature"]]:
103
86
  res = []
104
87
  for child in self.children:
105
88
  res.extend(child.get_column_nodes())
106
89
  return res
107
90
 
108
- def get_columns(self, **kwargs):
91
+ def get_columns(self, **kwargs) -> List[str]:
109
92
  column_list = []
110
93
  seen = set()
111
94
  for child in self.children:
@@ -119,7 +102,7 @@ class Feature(object):
119
102
  for child in self.children:
120
103
  child.delete_data()
121
104
 
122
- def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs):
105
+ def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
123
106
  if self.cached_display_name is not None and cache:
124
107
  return self.cached_display_name
125
108
 
@@ -139,27 +122,27 @@ class Feature(object):
139
122
  self.cached_display_name = display_name
140
123
  return display_name
141
124
 
142
- def set_display_index(self, index):
125
+ def set_display_index(self, index) -> "Feature":
143
126
  self.display_index = index
144
127
  self.cached_display_name = None
145
128
  return self
146
129
 
147
- def infer_type(self, data):
130
+ def infer_type(self, data: pd.DataFrame) -> Union[str, DtypeObj]:
148
131
  if self.op.output_type:
149
132
  return self.op.output_type
150
133
  else:
151
134
  # either a symmetrical operator or group by
152
135
  return self.children[0].infer_type(data)
153
136
 
154
- def calculate(self, data, is_root=False):
137
+ def calculate(self, data: pd.DataFrame, is_root=False) -> Union[pd.Series, pd.DataFrame]:
155
138
  if isinstance(self.op, PandasOperand) and self.op.is_vector:
156
- ds = [child.calculate(data) for child in self.children]
157
- new_data = self.op.calculate(data=ds)
158
-
159
- elif isinstance(self.op, PandasOperand):
160
- d1 = self.children[0].calculate(data)
161
- d2 = None if len(self.children) < 2 else self.children[1].calculate(data)
162
- new_data = self.op.calculate(data=d1, left=d1, right=d2)
139
+ if self.op.is_vector:
140
+ ds = [child.calculate(data) for child in self.children]
141
+ new_data = self.op.calculate(data=ds)
142
+ else:
143
+ d1 = self.children[0].calculate(data)
144
+ d2 = None if len(self.children) < 2 else self.children[1].calculate(data)
145
+ new_data = self.op.calculate(data=d1, left=d1, right=d2)
163
146
  else:
164
147
  raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
165
148
 
@@ -173,8 +156,8 @@ class Feature(object):
173
156
  return new_data
174
157
 
175
158
  @staticmethod
176
- def check_xor(left, right):
177
- def _get_all_columns(feature):
159
+ def check_xor(left: Union[Column, "Feature"], right: Union[Column, "Feature"]) -> bool:
160
+ def _get_all_columns(feature: Union[Column, "Feature"]) -> List[str]:
178
161
  if isinstance(feature, Column):
179
162
  return [feature.name]
180
163
  else:
@@ -190,7 +173,7 @@ class Feature(object):
190
173
  else:
191
174
  return True
192
175
 
193
- def to_formula(self, **kwargs):
176
+ def to_formula(self, **kwargs) -> str:
194
177
  if self.op.name in ["+", "-", "*", "/"]:
195
178
  left = self.children[0].to_formula(**kwargs)
196
179
  right = self.children[1].to_formula(**kwargs)
@@ -205,15 +188,30 @@ class Feature(object):
205
188
  result.append(")")
206
189
  return "".join(result)
207
190
 
191
+ def to_pretty_formula(self) -> str:
192
+ if self.op.name in ["+", "-", "*", "/"]:
193
+ left = self.children[0].to_pretty_formula()
194
+ right = self.children[1].to_pretty_formula()
195
+ return f"{left} {self.op.name} {right}"
196
+ else:
197
+ result = [self.op.name, "("]
198
+ for i in range(len(self.children)):
199
+ string_i = self.children[i].to_pretty_formula()
200
+ result.append(string_i)
201
+ result.append(", ")
202
+ result.pop()
203
+ result.append(")")
204
+ return "".join(result)
205
+
208
206
  @staticmethod
209
- def from_formula(string):
207
+ def from_formula(string: str) -> Union[Column, "Feature"]:
210
208
  if string[-1] != ")":
211
209
  return Column(string)
212
210
 
213
- def is_trivial_char(c):
211
+ def is_trivial_char(c: str) -> bool:
214
212
  return not (c in "()+-*/,")
215
213
 
216
- def find_prev(string):
214
+ def find_prev(string: str) -> int:
217
215
  if string[-1] != ")":
218
216
  return max([(0 if is_trivial_char(c) else i + 1) for i, c in enumerate(string)])
219
217
  level, pos = 0, -1
@@ -259,40 +257,65 @@ class Feature(object):
259
257
  return Feature(op, base_features)
260
258
 
261
259
 
262
- class Column(object):
263
- def __init__(self, name, data=None, calculate_all=False):
264
- self.name = name
265
- self.data = data
266
- self.calculate_all = calculate_all
260
+ class FeatureGroup:
261
+ def __init__(self, op: Operand, main_column: Optional[Union[Column, Feature]],
262
+ children: List[Union[Column, Feature]]):
263
+ self.op = op
264
+ self.main_column_node = main_column
265
+ self.children = children
266
+ self.data: Optional[pd.DataFrame] = None
267
267
 
268
- def rename_columns(self, mapping: Dict):
269
- self.name = self._unhash(mapping.get(self.name) or self.name)
270
- return self
268
+ def get_columns(self, **kwargs) -> List[str]:
269
+ column_list = []
270
+ seen = set()
271
+ for child in self.children:
272
+ columns = child.get_columns(**kwargs)
273
+ column_list.extend([f for f in columns if f not in seen])
274
+ seen.update(columns)
275
+ return column_list
271
276
 
272
- def _unhash(self, feature_name):
273
- last_component_idx = feature_name.rfind("_")
274
- if not feature_name.startswith("f_"):
275
- return feature_name # etalon feature
276
- elif last_component_idx == 1:
277
- return feature_name[2:] # fully hashed name, cannot unhash
277
+ def get_display_names(self, **kwargs) -> List[str]:
278
+ names = [f.get_display_name(**kwargs) for f in self.children]
279
+ return names
280
+
281
+ def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
282
+ main_column = None if self.main_column_node is None else self.main_column_node.get_columns()[0]
283
+ if isinstance(self.op, PandasOperand):
284
+ columns = self.get_columns()
285
+ new_data = self.op.calculate_group(data[columns], main_column=main_column)
286
+ new_data.rename(columns=dict(zip(columns, self.get_display_names())), inplace=True)
278
287
  else:
279
- return feature_name[2:last_component_idx]
288
+ raise NotImplementedError(f"Unrecognized operator {self.op.name}.")
280
289
 
281
- def delete_data(self):
282
- self.data = None
290
+ new_data.replace([-np.inf, np.inf], np.nan, inplace=True)
283
291
 
284
- def get_column_nodes(self):
285
- return [self]
292
+ if is_root:
293
+ self.data = new_data
294
+ return new_data
286
295
 
287
- def get_columns(self):
288
- return [self.name]
296
+ @staticmethod
297
+ def make_groups(candidates: List[Feature]) -> List[Union[Feature, "FeatureGroup"]]:
298
+ grouped_features = []
289
299
 
290
- def infer_type(self, data):
291
- return data[self.name].dtype
300
+ def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
301
+ return (f.op, f.children[0] if f.op.is_unary or f.op.is_vector else f.children[1])
292
302
 
293
- def calculate(self, data):
294
- self.data = data[self.name]
295
- return self.data
303
+ for op_child, features in itertools.groupby(candidates, groupby_func):
304
+ op, main_child = op_child
305
+ feature_list = list(features)
306
+ if op.is_vectorizable:
307
+ if op.is_unary:
308
+ group = FeatureGroup(op, main_column=None, children=feature_list)
309
+ else:
310
+ group = FeatureGroup(op, main_column=main_child, children=feature_list)
311
+ grouped_features.append(group)
312
+ else:
313
+ grouped_features.extend(feature_list)
314
+ return grouped_features
296
315
 
297
- def to_formula(self, **kwargs):
298
- return str(self.get_columns(**kwargs)[0])
316
+ def delete_data(self):
317
+ self.data = None
318
+ if self.main_column_node:
319
+ self.main_column_node.delete_data()
320
+ for child in self.children:
321
+ child.delete_data()
@@ -2921,6 +2921,7 @@ class FeaturesEnricher(TransformerMixin):
2921
2921
  if feature_meta is None:
2922
2922
  self.logger.warning(f"Feature meta for display index {m.display_index} not found")
2923
2923
  continue
2924
+ description["shap"] = feature_meta.shap_value
2924
2925
  description["Sources"] = feature_meta.data_source\
2925
2926
  .replace("AutoFE: features from ", "")\
2926
2927
  .replace("AutoFE: feature from ", "")
@@ -2940,6 +2941,8 @@ class FeaturesEnricher(TransformerMixin):
2940
2941
 
2941
2942
  descriptions_df = pd.DataFrame(descriptions)
2942
2943
  descriptions_df.fillna("", inplace=True)
2944
+ descriptions_df.sort_values(by="shap", ascending=False, inplace=True)
2945
+ descriptions_df.drop(columns="shap", inplace=True)
2943
2946
  return descriptions_df
2944
2947
  except Exception:
2945
2948
  self.logger.exception("Failed to generate AutoFE features description")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.231a2
3
+ Version: 1.1.232a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -2,7 +2,7 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
3
  upgini/dataset.py,sha256=7z9zbVvd1_MiufmoZlCwEHwQ25Q2DX_0g9PFcSMlqMY,49764
4
4
  upgini/errors.py,sha256=BqpvfhW2jJW5fa5KXj0alhXatGl-WK4xTl309-QNLp8,959
5
- upgini/features_enricher.py,sha256=vPol2Oi_Mm-4F_iQBA9L_bm_bWdv_SU0Jmu9sVJK9YM,158650
5
+ upgini/features_enricher.py,sha256=2B9rk_8QNMV7o1khbgZX8A1T6vJqyfki4F4UAYoR0po,158857
6
6
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
7
7
  upgini/http.py,sha256=HzUSZudCdISJGUqHC1gAT1v_x1n_dIFVDJW4z3Q7DCs,41204
8
8
  upgini/metadata.py,sha256=FZ5CQluLLWrfrBVThSIes1SW6wcs7n50aNZwzYnHiF0,9584
@@ -15,7 +15,7 @@ upgini/ads_management/ads_manager.py,sha256=O6Pcl_y5e_ULfQ-xmGGn_qBP4z7EtV7TP9et
15
15
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
16
  upgini/autofe/all_operands.py,sha256=du44N6ISWe3ikb0y9ZzSOHNbLiyEYrJPwoBo0Z6xp2s,1487
17
17
  upgini/autofe/binary.py,sha256=f8LQqZi9zyaMUAv-jASMmWNA_vT05ncYCjZq0qx3USs,3972
18
- upgini/autofe/feature.py,sha256=cElNcLfw9BeBVUkkaFzWWXrnyWNUCXiw0FGqsitorbE,10133
18
+ upgini/autofe/feature.py,sha256=iDB_cL49w7AYl-96AkVqWBynrE_ZqK0fxgTfuJJoruA,11847
19
19
  upgini/autofe/groupby.py,sha256=iXRfOmOc84ooSzRhsh9GmmG7rTafX0-ekXko8s9Qs68,3089
20
20
  upgini/autofe/operand.py,sha256=8WqEoSIA5rEWCK1xuC303E4NW5a72GZ5jUMAEj4skII,2291
21
21
  upgini/autofe/unary.py,sha256=7TBe7PCt7l_XQEqu_G5g_TC2cW3tppL7uPDcX8xsqz0,2731
@@ -53,8 +53,8 @@ upgini/utils/sklearn_ext.py,sha256=IMx2La70AXAggApVpT7sMEjWqVWon5AMZt4MARDsIMQ,4
53
53
  upgini/utils/target_utils.py,sha256=cu52icjhDIPpEStHYMXrD2hIl9gzvfnxZr0Ra5osV0k,1616
54
54
  upgini/utils/track_info.py,sha256=DVNVZmXUb4f25DSPEuUNEFx49hNEBfmuY9iSW5jkMnI,5708
55
55
  upgini/utils/warning_counter.py,sha256=vnmdFo5-7GBkU2bK9h_uC0K0Y_wtfcYstxOdeRfacO0,228
56
- upgini-1.1.231a2.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
57
- upgini-1.1.231a2.dist-info/METADATA,sha256=RRZJq05KPtLkixg7feODcdrsAdwioqWdEM-D9aJnffY,48400
58
- upgini-1.1.231a2.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
59
- upgini-1.1.231a2.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
60
- upgini-1.1.231a2.dist-info/RECORD,,
56
+ upgini-1.1.232a2.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
57
+ upgini-1.1.232a2.dist-info/METADATA,sha256=s6aUWidVUESHbFanlUtLlmWr4izBNGLTeQj1O1bH82A,48400
58
+ upgini-1.1.232a2.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
59
+ upgini-1.1.232a2.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
60
+ upgini-1.1.232a2.dist-info/RECORD,,