upgini 1.2.98a3922.dev3__py3-none-any.whl → 1.2.99a3922.dev4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/feature.py +27 -10
- upgini/autofe/operator.py +8 -1
- upgini/autofe/unary.py +22 -1
- upgini/autofe/vector.py +1 -1
- upgini/features_enricher.py +1 -1
- upgini/metrics.py +6 -6
- {upgini-1.2.98a3922.dev3.dist-info → upgini-1.2.99a3922.dev4.dist-info}/METADATA +1 -1
- {upgini-1.2.98a3922.dev3.dist-info → upgini-1.2.99a3922.dev4.dist-info}/RECORD +11 -11
- {upgini-1.2.98a3922.dev3.dist-info → upgini-1.2.99a3922.dev4.dist-info}/WHEEL +0 -0
- {upgini-1.2.98a3922.dev3.dist-info → upgini-1.2.99a3922.dev4.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.98a3922.dev3"
|
1
|
+
__version__ = "1.2.99a3922.dev4"
|
upgini/autofe/feature.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import hashlib
|
2
2
|
import itertools
|
3
|
+
import logging
|
3
4
|
from typing import Dict, List, Optional, Set, Tuple, Union
|
4
5
|
|
5
6
|
import numpy as np
|
@@ -18,10 +19,7 @@ class Column:
|
|
18
19
|
self.data = data
|
19
20
|
self.calculate_all = calculate_all
|
20
21
|
|
21
|
-
def
|
22
|
-
return self.name
|
23
|
-
|
24
|
-
def set_op_params(self, params: Dict[str, str]) -> "Column":
|
22
|
+
def set_op_params(self, params: Dict[str, str], **kwargs) -> "Column":
|
25
23
|
return self
|
26
24
|
|
27
25
|
def get_op_params(self, **kwargs):
|
@@ -37,8 +35,21 @@ class Column:
|
|
37
35
|
def get_column_nodes(self) -> List["Column"]:
|
38
36
|
return [self]
|
39
37
|
|
40
|
-
def get_columns(self, **kwargs)
|
41
|
-
|
38
|
+
def get_columns(self, unhash=False, **kwargs):
|
39
|
+
name = self.name
|
40
|
+
return [self._unhash(name) if unhash else name]
|
41
|
+
|
42
|
+
def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
|
43
|
+
return self.get_columns(**kwargs)[0]
|
44
|
+
|
45
|
+
def _unhash(self, feature_name: str) -> str:
|
46
|
+
last_component_idx = feature_name.rfind("_")
|
47
|
+
if not feature_name.startswith("f_"):
|
48
|
+
return feature_name # etalon feature
|
49
|
+
elif last_component_idx == 1:
|
50
|
+
return feature_name[2:] # fully hashed name, cannot unhash
|
51
|
+
else:
|
52
|
+
return feature_name[2:last_component_idx]
|
42
53
|
|
43
54
|
@property
|
44
55
|
def children(self) -> List[Union["Feature", "Column"]]:
|
@@ -81,7 +92,7 @@ class Feature:
|
|
81
92
|
self.cached_display_name = cached_display_name
|
82
93
|
self.alias = alias
|
83
94
|
|
84
|
-
def set_op_params(self, params: Optional[Dict[str, str]]) -> "Feature":
|
95
|
+
def set_op_params(self, params: Optional[Dict[str, str]], **kwargs) -> "Feature":
|
85
96
|
obj_dict = pydantic_dump_method(self.op)().copy()
|
86
97
|
obj_dict.update(params or {})
|
87
98
|
self.op = pydantic_parse_method(self.op.__class__)(obj_dict)
|
@@ -89,13 +100,13 @@ class Feature:
|
|
89
100
|
|
90
101
|
for child in self.children:
|
91
102
|
child_params = {
|
92
|
-
k[len(child.get_display_name()) + 1 :]: v
|
103
|
+
k[len(child.get_display_name(**kwargs)) + 1 :]: v
|
93
104
|
for k, v in params.items()
|
94
|
-
if k.startswith(child.get_display_name())
|
105
|
+
if k.startswith(child.get_display_name(**kwargs))
|
95
106
|
}
|
96
107
|
if not child_params:
|
97
108
|
child_params = params
|
98
|
-
child.set_op_params(child_params)
|
109
|
+
child.set_op_params(child_params, **kwargs)
|
99
110
|
return self
|
100
111
|
|
101
112
|
def get_op_params(self, **kwargs) -> Dict[str, str]:
|
@@ -341,6 +352,12 @@ class Feature:
|
|
341
352
|
base_features.reverse()
|
342
353
|
return Feature(op, base_features)
|
343
354
|
|
355
|
+
def set_logger(self, logger: logging.Logger):
|
356
|
+
self.op.set_logger(logger)
|
357
|
+
for child in self.children:
|
358
|
+
child.set_logger(logger)
|
359
|
+
return self
|
360
|
+
|
344
361
|
|
345
362
|
class FeatureGroup:
|
346
363
|
def __init__(
|
upgini/autofe/operator.py
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
import abc
|
2
|
+
import logging
|
2
3
|
from typing import Dict, List, Optional, Tuple, Union
|
3
4
|
|
4
5
|
import numpy as np
|
5
6
|
import pandas as pd
|
6
|
-
from pydantic import BaseModel
|
7
|
+
from pydantic import BaseModel, PrivateAttr
|
7
8
|
|
8
9
|
|
9
10
|
class OperatorRegistry(type(BaseModel)):
|
@@ -64,6 +65,8 @@ class Operator(BaseModel, metaclass=OperatorRegistry):
|
|
64
65
|
is_distribution_dependent: bool = False
|
65
66
|
params: Optional[Dict[str, str]] = None
|
66
67
|
|
68
|
+
_logger: logging.Logger = PrivateAttr(default=logging.getLogger(__name__))
|
69
|
+
|
67
70
|
def set_params(self, params: Dict[str, str]):
|
68
71
|
self.params = params
|
69
72
|
return self
|
@@ -79,6 +82,10 @@ class Operator(BaseModel, metaclass=OperatorRegistry):
|
|
79
82
|
def get_hash_component(self) -> str:
|
80
83
|
return self.to_formula()
|
81
84
|
|
85
|
+
def set_logger(self, logger: logging.Logger):
|
86
|
+
self._logger = logger
|
87
|
+
return self
|
88
|
+
|
82
89
|
|
83
90
|
class ParametrizedOperator(Operator, abc.ABC):
|
84
91
|
|
upgini/autofe/unary.py
CHANGED
@@ -3,7 +3,7 @@ from typing import Dict, List, Optional
|
|
3
3
|
import numpy as np
|
4
4
|
import pandas as pd
|
5
5
|
|
6
|
-
from upgini.autofe.operator import PandasOperator, VectorizableMixin
|
6
|
+
from upgini.autofe.operator import PandasOperator, ParametrizedOperator, VectorizableMixin
|
7
7
|
from upgini.autofe.utils import pydantic_validator
|
8
8
|
|
9
9
|
|
@@ -198,3 +198,24 @@ class Cluster(PandasOperator):
|
|
198
198
|
input_type: Optional[str] = "vector"
|
199
199
|
output_type: Optional[str] = "category"
|
200
200
|
is_categorical: bool = True
|
201
|
+
|
202
|
+
|
203
|
+
class OutlierDistance(PandasOperator, ParametrizedOperator):
|
204
|
+
name: str = "outlier_dist"
|
205
|
+
is_unary: bool = True
|
206
|
+
input_type: Optional[str] = "vector"
|
207
|
+
output_type: Optional[str] = "float"
|
208
|
+
class_value: Optional[str] = None
|
209
|
+
|
210
|
+
def to_formula(self) -> str:
|
211
|
+
return f"outlier_dist_{self.class_value if self.class_value is not None else 'all'}"
|
212
|
+
|
213
|
+
@classmethod
|
214
|
+
def from_formula(cls, formula: str) -> Optional["OutlierDistance"]:
|
215
|
+
if formula == "outlier_dist":
|
216
|
+
return cls()
|
217
|
+
|
218
|
+
if formula.startswith("outlier_dist_"):
|
219
|
+
class_value = formula.split("_")[-1]
|
220
|
+
return cls(class_value=None if class_value == "all" else class_value)
|
221
|
+
return None
|
upgini/autofe/vector.py
CHANGED
upgini/features_enricher.py
CHANGED
@@ -4174,7 +4174,7 @@ if response.status_code == 200:
|
|
4174
4174
|
|
4175
4175
|
description = {}
|
4176
4176
|
|
4177
|
-
feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
|
4177
|
+
feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True, unhash=True))
|
4178
4178
|
if feature_meta is None:
|
4179
4179
|
self.logger.warning(f"Feature meta for display index {m.display_index} not found")
|
4180
4180
|
continue
|
upgini/metrics.py
CHANGED
@@ -399,14 +399,14 @@ class EstimatorWrapper:
|
|
399
399
|
self.converted_to_str.append(c)
|
400
400
|
elif c in self.cat_features:
|
401
401
|
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
402
|
-
x[c] = x[c].astype(
|
402
|
+
x[c] = x[c].astype(pd.Int64Dtype())
|
403
403
|
self.converted_to_int.append(c)
|
404
404
|
elif x[c].dtype == "category" and is_integer_dtype(x[c].cat.categories):
|
405
405
|
self.logger.info(
|
406
406
|
f"Convert categorical feature {c} with integer categories"
|
407
407
|
" to int64 and remove from cat_features"
|
408
408
|
)
|
409
|
-
x[c] = x[c].astype(
|
409
|
+
x[c] = x[c].astype(pd.Int64Dtype())
|
410
410
|
self.converted_to_int.append(c)
|
411
411
|
self.cat_features.remove(c)
|
412
412
|
elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
|
@@ -419,7 +419,7 @@ class EstimatorWrapper:
|
|
419
419
|
else:
|
420
420
|
if x[c].dtype == "bool" or (x[c].dtype == "category" and x[c].cat.categories.dtype == "bool"):
|
421
421
|
self.logger.info(f"Convert bool feature {c} to int64")
|
422
|
-
x[c] = x[c].astype(
|
422
|
+
x[c] = x[c].astype(pd.Int64Dtype())
|
423
423
|
self.converted_to_int.append(c)
|
424
424
|
elif not is_valid_numeric_array_data(x[c]) and not is_numeric_dtype(x[c]):
|
425
425
|
try:
|
@@ -442,7 +442,7 @@ class EstimatorWrapper:
|
|
442
442
|
if self.converted_to_int:
|
443
443
|
self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
|
444
444
|
for c in self.converted_to_int:
|
445
|
-
x[c] = x[c].astype(
|
445
|
+
x[c] = x[c].astype(pd.Int64Dtype())
|
446
446
|
|
447
447
|
if self.converted_to_str:
|
448
448
|
self.logger.info(f"Convert to str features on calculate metrics: {self.converted_to_str}")
|
@@ -896,7 +896,7 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
896
896
|
x[c] = x[c].astype("category")
|
897
897
|
|
898
898
|
for c in x.columns:
|
899
|
-
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
899
|
+
if x[c].dtype not in ["category", "int64", "float64", "bool", "Int64"]:
|
900
900
|
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
901
901
|
self.dropped_features.append(c)
|
902
902
|
x = x.drop(columns=c, errors="ignore")
|
@@ -987,7 +987,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
987
987
|
x[c] = x[c].astype("category")
|
988
988
|
params["cat_features"] = self.cat_features
|
989
989
|
for c in x.columns:
|
990
|
-
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
990
|
+
if x[c].dtype not in ["category", "int64", "float64", "bool", "Int64"]:
|
991
991
|
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
992
992
|
self.dropped_features.append(c)
|
993
993
|
x = x.drop(columns=c, errors="ignore")
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.98a3922.dev3
|
3
|
+
Version: 1.2.99a3922.dev4
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -1,12 +1,12 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=5Lrxh5wP8aiUGT1GPRS8K7nPnEINmj_I5a_XBymupWQ,33
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=e6JDYTZ2AwC5aF-dqclKZKkiKrHo2f6cFmMQO2ZZmjM,32724
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=KSOEzO29nY79RIW0hdbf1qXQGxa3itKZ0PkcwVPPf9U,219954
|
7
7
|
upgini/http.py,sha256=DNcoS7qdxG0mOJn6I8r6O5I6XdIJTdzDzW3hkz3NgG4,45443
|
8
8
|
upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=gXr2aiw5j9QBWBo1hZp40Is679hef5q8MrT6LJfjsBk,45661
|
10
10
|
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -16,12 +16,12 @@ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
16
|
upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
|
17
17
|
upgini/autofe/binary.py,sha256=oOEECc4nRzZN2tYaiqx8F2XHnfWpk1bVvb7ZkZJ0lO8,7709
|
18
18
|
upgini/autofe/date.py,sha256=MM1S-6imNSzCDOhbNnmsc_bwSqUWBcS8vWAdHF8j1kY,11134
|
19
|
-
upgini/autofe/feature.py,sha256=
|
19
|
+
upgini/autofe/feature.py,sha256=71IQXztYdG2nygVJ4AZ4mOsx5w8PN239rZguKy_4lnE,16250
|
20
20
|
upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
|
21
|
-
upgini/autofe/operator.py,sha256=
|
22
|
-
upgini/autofe/unary.py,sha256=
|
21
|
+
upgini/autofe/operator.py,sha256=WpMd3C-7FpiNXhVDs3MQy7Benz9B6iq6jvXohnCms9c,5178
|
22
|
+
upgini/autofe/unary.py,sha256=FFtvkQaT0cu_zPZ1jCLcsjik-UUh12qQFF3tUW8NqsE,6675
|
23
23
|
upgini/autofe/utils.py,sha256=dYrtyAM8Vcc_R8u4dNo54IsGrHKagTHDJTKhGho0bRg,2967
|
24
|
-
upgini/autofe/vector.py,sha256=
|
24
|
+
upgini/autofe/vector.py,sha256=9T7MEUK0SavXIJy0c9Kvu5qTcMtt3fzvdRDBDxcI0JA,1277
|
25
25
|
upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
|
26
26
|
upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
|
27
27
|
upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
|
@@ -71,7 +71,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
|
|
71
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
72
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
73
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.98a3922.dev3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
76
|
-
upgini-1.2.98a3922.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
-
upgini-1.2.98a3922.dev3.dist-info/RECORD,,
|
74
|
+
upgini-1.2.99a3922.dev4.dist-info/METADATA,sha256=wYgu44FVyY6Bfof83_UJ1tWMWxrKTUoY_m1Q0QHDqJ8,49538
|
75
|
+
upgini-1.2.99a3922.dev4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
76
|
+
upgini-1.2.99a3922.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
+
upgini-1.2.99a3922.dev4.dist-info/RECORD,,
|
File without changes
|
File without changes
|