upgini 1.2.98a3922.dev3__py3-none-any.whl → 1.2.99a3922.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.98a3922.dev3"
1
+ __version__ = "1.2.99a3922.dev4"
upgini/autofe/feature.py CHANGED
@@ -1,5 +1,6 @@
1
1
  import hashlib
2
2
  import itertools
3
+ import logging
3
4
  from typing import Dict, List, Optional, Set, Tuple, Union
4
5
 
5
6
  import numpy as np
@@ -18,10 +19,7 @@ class Column:
18
19
  self.data = data
19
20
  self.calculate_all = calculate_all
20
21
 
21
- def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
22
- return self.name
23
-
24
- def set_op_params(self, params: Dict[str, str]) -> "Column":
22
+ def set_op_params(self, params: Dict[str, str], **kwargs) -> "Column":
25
23
  return self
26
24
 
27
25
  def get_op_params(self, **kwargs):
@@ -37,8 +35,21 @@ class Column:
37
35
  def get_column_nodes(self) -> List["Column"]:
38
36
  return [self]
39
37
 
40
- def get_columns(self, **kwargs) -> List[str]:
41
- return [self.name]
38
+ def get_columns(self, unhash=False, **kwargs):
39
+ name = self.name
40
+ return [self._unhash(name) if unhash else name]
41
+
42
+ def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
43
+ return self.get_columns(**kwargs)[0]
44
+
45
+ def _unhash(self, feature_name: str) -> str:
46
+ last_component_idx = feature_name.rfind("_")
47
+ if not feature_name.startswith("f_"):
48
+ return feature_name # etalon feature
49
+ elif last_component_idx == 1:
50
+ return feature_name[2:] # fully hashed name, cannot unhash
51
+ else:
52
+ return feature_name[2:last_component_idx]
42
53
 
43
54
  @property
44
55
  def children(self) -> List[Union["Feature", "Column"]]:
@@ -81,7 +92,7 @@ class Feature:
81
92
  self.cached_display_name = cached_display_name
82
93
  self.alias = alias
83
94
 
84
- def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
95
+ def set_op_params(self, params: Optional[Dict[str, str]], **kwargs) -> "Feature":
85
96
  obj_dict = pydantic_dump_method(self.op)().copy()
86
97
  obj_dict.update(params or {})
87
98
  self.op = pydantic_parse_method(self.op.__class__)(obj_dict)
@@ -89,13 +100,13 @@ class Feature:
89
100
 
90
101
  for child in self.children:
91
102
  child_params = {
92
- k[len(child.get_display_name()) + 1 :]: v
103
+ k[len(child.get_display_name(**kwargs)) + 1 :]: v
93
104
  for k, v in params.items()
94
- if k.startswith(child.get_display_name())
105
+ if k.startswith(child.get_display_name(**kwargs))
95
106
  }
96
107
  if not child_params:
97
108
  child_params = params
98
- child.set_op_params(child_params)
109
+ child.set_op_params(child_params, **kwargs)
99
110
  return self
100
111
 
101
112
  def get_op_params(self, **kwargs) -> Dict[str, str]:
@@ -341,6 +352,12 @@ class Feature:
341
352
  base_features.reverse()
342
353
  return Feature(op, base_features)
343
354
 
355
+ def set_logger(self, logger: logging.Logger):
356
+ self.op.set_logger(logger)
357
+ for child in self.children:
358
+ child.set_logger(logger)
359
+ return self
360
+
344
361
 
345
362
  class FeatureGroup:
346
363
  def __init__(
upgini/autofe/operator.py CHANGED
@@ -1,9 +1,10 @@
1
1
  import abc
2
+ import logging
2
3
  from typing import Dict, List, Optional, Tuple, Union
3
4
 
4
5
  import numpy as np
5
6
  import pandas as pd
6
- from pydantic import BaseModel
7
+ from pydantic import BaseModel, PrivateAttr
7
8
 
8
9
 
9
10
  class OperatorRegistry(type(BaseModel)):
@@ -64,6 +65,8 @@ class Operator(BaseModel, metaclass=OperatorRegistry):
64
65
  is_distribution_dependent: bool = False
65
66
  params: Optional[Dict[str, str]] = None
66
67
 
68
+ _logger: logging.Logger = PrivateAttr(default=logging.getLogger(__name__))
69
+
67
70
  def set_params(self, params: Dict[str, str]):
68
71
  self.params = params
69
72
  return self
@@ -79,6 +82,10 @@ class Operator(BaseModel, metaclass=OperatorRegistry):
79
82
  def get_hash_component(self) -> str:
80
83
  return self.to_formula()
81
84
 
85
+ def set_logger(self, logger: logging.Logger):
86
+ self._logger = logger
87
+ return self
88
+
82
89
 
83
90
  class ParametrizedOperator(Operator, abc.ABC):
84
91
 
upgini/autofe/unary.py CHANGED
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional
3
3
  import numpy as np
4
4
  import pandas as pd
5
5
 
6
- from upgini.autofe.operator import PandasOperator, VectorizableMixin
6
+ from upgini.autofe.operator import PandasOperator, ParametrizedOperator, VectorizableMixin
7
7
  from upgini.autofe.utils import pydantic_validator
8
8
 
9
9
 
@@ -198,3 +198,24 @@ class Cluster(PandasOperator):
198
198
  input_type: Optional[str] = "vector"
199
199
  output_type: Optional[str] = "category"
200
200
  is_categorical: bool = True
201
+
202
+
203
+ class OutlierDistance(PandasOperator, ParametrizedOperator):
204
+ name: str = "outlier_dist"
205
+ is_unary: bool = True
206
+ input_type: Optional[str] = "vector"
207
+ output_type: Optional[str] = "float"
208
+ class_value: Optional[str] = None
209
+
210
+ def to_formula(self) -> str:
211
+ return f"outlier_dist_{self.class_value if self.class_value is not None else 'all'}"
212
+
213
+ @classmethod
214
+ def from_formula(cls, formula: str) -> Optional["OutlierDistance"]:
215
+ if formula == "outlier_dist":
216
+ return cls()
217
+
218
+ if formula.startswith("outlier_dist_"):
219
+ class_value = formula.split("_")[-1]
220
+ return cls(class_value=None if class_value == "all" else class_value)
221
+ return None
upgini/autofe/vector.py CHANGED
@@ -1,4 +1,4 @@
1
- from typing import Dict, List, Optional
1
+ from typing import List, Optional
2
2
 
3
3
  import pandas as pd
4
4
 
upgini/features_enricher.py CHANGED
@@ -4174,7 +4174,7 @@ if response.status_code == 200:
4174
4174
 
4175
4175
  description = {}
4176
4176
 
4177
- feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
4177
+ feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True, unhash=True))
4178
4178
  if feature_meta is None:
4179
4179
  self.logger.warning(f"Feature meta for display index {m.display_index} not found")
4180
4180
  continue
upgini/metrics.py CHANGED
@@ -399,14 +399,14 @@ class EstimatorWrapper:
399
399
  self.converted_to_str.append(c)
400
400
  elif c in self.cat_features:
401
401
  if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
402
- x[c] = x[c].astype(np.int64)
402
+ x[c] = x[c].astype(pd.Int64Dtype())
403
403
  self.converted_to_int.append(c)
404
404
  elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
405
405
  self.logger.info(
406
406
  f"Convert categorical feature {c} with integer categories"
407
407
  " to int64 and remove from cat_features"
408
408
  )
409
- x[c] = x[c].astype(np.int64)
409
+ x[c] = x[c].astype(pd.Int64Dtype())
410
410
  self.converted_to_int.append(c)
411
411
  self.cat_features.remove(c)
412
412
  elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
@@ -419,7 +419,7 @@ class EstimatorWrapper:
419
419
  else:
420
420
  if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
421
421
  self.logger.info(f"Convert bool feature {c} to int64")
422
- x[c] = x[c].astype(np.int64)
422
+ x[c] = x[c].astype(pd.Int64Dtype())
423
423
  self.converted_to_int.append(c)
424
424
  elif not is_valid_numeric_array_data(x[c]) and not is_numeric_dtype(x[c]):
425
425
  try:
@@ -442,7 +442,7 @@ class EstimatorWrapper:
442
442
  if self.converted_to_int:
443
443
  self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
444
444
  for c in self.converted_to_int:
445
- x[c] = x[c].astype(np.int64)
445
+ x[c] = x[c].astype(pd.Int64Dtype())
446
446
 
447
447
  if self.converted_to_str:
448
448
  self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
@@ -896,7 +896,7 @@ class LightGBMWrapper(EstimatorWrapper):
896
896
  x[c] = x[c].astype("category")
897
897
 
898
898
  for c in x.columns:
899
- if x[c].dtype not in ["category", "int64", "float64", "bool"]:
899
+ if x[c].dtype not in ["category", "int64", "float64", "bool", "Int64"]:
900
900
  self.logger.warning(f"Feature {c} is not numeric and will be dropped")
901
901
  self.dropped_features.append(c)
902
902
  x = x.drop(columns=c, errors="ignore")
@@ -987,7 +987,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
987
987
  x[c] = x[c].astype("category")
988
988
  params["cat_features"] = self.cat_features
989
989
  for c in x.columns:
990
- if x[c].dtype not in ["category", "int64", "float64", "bool"]:
990
+ if x[c].dtype not in ["category", "int64", "float64", "bool", "Int64"]:
991
991
  self.logger.warning(f"Feature {c} is not numeric and will be dropped")
992
992
  self.dropped_features.append(c)
993
993
  x = x.drop(columns=c, errors="ignore")
upgini-1.2.99a3922.dev4.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.98a3922.dev3
3
+ Version: 1.2.99a3922.dev4
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
upgini-1.2.99a3922.dev4.dist-info/RECORD CHANGED
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=c20ALjeM25Bh-ipz7uc8Eb_tWD5utgqiELwRRlqcRlw,33
1
+ upgini/__about__.py,sha256=5Lrxh5wP8aiUGT1GPRS8K7nPnEINmj_I5a_XBymupWQ,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=e6JDYTZ2AwC5aF-dqclKZKkiKrHo2f6cFmMQO2ZZmjM,32724
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=4rKoV-3jM876Fk0fM4XlnW3fLwXvk1KN2ymcwlAfPm0,219941
6
+ upgini/features_enricher.py,sha256=KSOEzO29nY79RIW0hdbf1qXQGxa3itKZ0PkcwVPPf9U,219954
7
7
  upgini/http.py,sha256=DNcoS7qdxG0mOJn6I8r6O5I6XdIJTdzDzW3hkz3NgG4,45443
8
8
  upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
9
- upgini/metrics.py,sha256=UbKEsHB7XDzoyGNqDx846zbh1t65GpqdnnhViccdoKU,45615
9
+ upgini/metrics.py,sha256=gXr2aiw5j9QBWBo1hZp40Is679hef5q8MrT6LJfjsBk,45661
10
10
  upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -16,12 +16,12 @@ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
16
  upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
17
17
  upgini/autofe/binary.py,sha256=oOEECc4nRzZN2tYaiqx8F2XHnfWpk1bVvb7ZkZJ0lO8,7709
18
18
  upgini/autofe/date.py,sha256=MM1S-6imNSzCDOhbNnmsc_bwSqUWBcS8vWAdHF8j1kY,11134
19
- upgini/autofe/feature.py,sha256=cu4xXjzVVF13ZV4RxuTrysK2qCfezlRCMOzCKRo1rNs,15558
19
+ upgini/autofe/feature.py,sha256=71IQXztYdG2nygVJ4AZ4mOsx5w8PN239rZguKy_4lnE,16250
20
20
  upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
21
- upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
22
- upgini/autofe/unary.py,sha256=N76Pehn-hO8FWlSdqJ2Wm-yoU1MSR7m6yb2GWYBcumU,5933
21
+ upgini/autofe/operator.py,sha256=WpMd3C-7FpiNXhVDs3MQy7Benz9B6iq6jvXohnCms9c,5178
22
+ upgini/autofe/unary.py,sha256=FFtvkQaT0cu_zPZ1jCLcsjik-UUh12qQFF3tUW8NqsE,6675
23
23
  upgini/autofe/utils.py,sha256=dYrtyAM8Vcc_R8u4dNo54IsGrHKagTHDJTKhGho0bRg,2967
24
- upgini/autofe/vector.py,sha256=NBvRLXVSQf8AU5WI-rXBlO2lfs-skX_XD0KaxkfBFW8,1283
24
+ upgini/autofe/vector.py,sha256=9T7MEUK0SavXIJy0c9Kvu5qTcMtt3fzvdRDBDxcI0JA,1277
25
25
  upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
26
26
  upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
27
27
  upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
@@ -71,7 +71,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
71
71
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
72
72
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
73
73
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
74
- upgini-1.2.98a3922.dev3.dist-info/METADATA,sha256=N0PxLZz_XaDEyH77cUCwjKE3ocLXAOo6n5Cy_1xYb8w,49538
75
- upgini-1.2.98a3922.dev3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
76
- upgini-1.2.98a3922.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
- upgini-1.2.98a3922.dev3.dist-info/RECORD,,
74
+ upgini-1.2.99a3922.dev4.dist-info/METADATA,sha256=wYgu44FVyY6Bfof83_UJ1tWMWxrKTUoY_m1Q0QHDqJ8,49538
75
+ upgini-1.2.99a3922.dev4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
76
+ upgini-1.2.99a3922.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
+ upgini-1.2.99a3922.dev4.dist-info/RECORD,,