upgini 1.2.66a3818.dev1__py3-none-any.whl → 1.2.68a3818.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flags this version of upgini as potentially problematic.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +15 -21
- upgini/autofe/timeseries/cross.py +15 -7
- upgini/autofe/timeseries/roll.py +2 -7
- upgini/autofe/utils.py +83 -0
- upgini/features_enricher.py +152 -153
- upgini/resource_bundle/strings.properties +1 -0
- upgini/search_task.py +7 -1
- upgini/utils/feature_info.py +22 -11
- upgini/utils/mstats.py +1 -1
- upgini/utils/sort.py +8 -2
- {upgini-1.2.66a3818.dev1.dist-info → upgini-1.2.68a3818.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.66a3818.dev1.dist-info → upgini-1.2.68a3818.dev1.dist-info}/RECORD +15 -14
- {upgini-1.2.66a3818.dev1.dist-info → upgini-1.2.68a3818.dev1.dist-info}/WHEEL +0 -0
- {upgini-1.2.66a3818.dev1.dist-info → upgini-1.2.68a3818.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.66a3818.dev1"
+__version__ = "1.2.68a3818.dev1"
upgini/autofe/date.py
CHANGED
@@ -8,6 +8,7 @@ from pandas.core.arrays.timedeltas import TimedeltaArray
 from pydantic import BaseModel, __version__ as pydantic_version
 
 from upgini.autofe.operator import PandasOperator, ParametrizedOperator
+from upgini.autofe.utils import pydantic_validator
 
 
 def get_pydantic_version():
@@ -209,6 +210,14 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
 
         return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
 
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        if self.lower_bound is not None:
+            res["lower_bound"] = str(self.lower_bound)
+        if self.upper_bound is not None:
+            res["upper_bound"] = str(self.upper_bound)
+        return res
+
     def _agg(self, x):
         x = x[
             (x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
@@ -269,32 +278,17 @@ class DatePercentile(DatePercentileBase):
             {
                 "zero_month": self.zero_month,
                 "zero_year": self.zero_year,
-                "zero_bounds": self.zero_bounds,
+                "zero_bounds": json.dumps(self.zero_bounds),
                 "step": self.step,
             }
         )
         return res
 
-
-
-
-
-
-        @field_validator("zero_bounds", mode="before")
-        def parse_zero_bounds(cls, value):
-            if isinstance(value, str):
-                return json.loads(value)
-            return value
-
-    else:
-        # Use @validator for Pydantic 1.x
-        from pydantic import validator
-
-        @validator("zero_bounds", pre=True)
-        def parse_zero_bounds(cls, value):
-            if isinstance(value, str):
-                return json.loads(value)
-            return value
+    @pydantic_validator("zero_bounds", mode="before")
+    def parse_zero_bounds(cls, value):
+        if isinstance(value, str):
+            return json.loads(value)
+        return value
 
     def _get_bounds(self, date_col: pd.Series) -> pd.Series:
         months = date_col.dt.month
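The get_params override above serializes optional bounds to strings, and emits them only when set. A hypothetical round-trip sketch (the constructor kwargs are taken from the cls(...) call visible in the hunk; the keys returned by the parent get_params are an assumption):

# Illustrative only, not from the package: exercises the new get_params.
from upgini.autofe.date import DateListDiffBounded

op = DateListDiffBounded(diff_unit="Y", lower_bound=18, upper_bound=65, aggregation="count")
params = op.get_params()
# Bounds come back as strings, suitable for a string-params round-trip:
assert params["lower_bound"] == "18"
assert params["upper_bound"] == "65"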
upgini/autofe/timeseries/cross.py
CHANGED

@@ -1,16 +1,13 @@
+import json
 from typing import Dict, List, Optional
 
 import numpy as np
 import pandas as pd
 
-try:
-    from pydantic import field_validator as validator  # V2
-except ImportError:
-    from pydantic import validator  # V1
-
 from upgini.autofe.all_operators import find_op
 from upgini.autofe.operator import PandasOperator, ParametrizedOperator
 from upgini.autofe.timeseries.base import TimeSeriesBase
+from upgini.autofe.utils import pydantic_validator
 
 
 class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
@@ -20,13 +17,24 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
     left_descriptor: List[str] = []
     right_descriptor: List[str] = []
 
-    @
-    @classmethod
+    @pydantic_validator("descriptor_indices")
     def validate_descriptor_indices(cls, v):
         if not v:
             raise ValueError("descriptor_indices cannot be empty for CrossSeriesInteraction")
         return v
 
+    @pydantic_validator("left_descriptor", "right_descriptor", mode="before")
+    def parse_descriptors(cls, v):
+        if isinstance(v, str):
+            return json.loads(v)
+        return v
+
+    @pydantic_validator("interaction_op", mode="before")
+    def validate_interaction_op(cls, v):
+        if isinstance(v, str):
+            return find_op(v)
+        return v
+
     def __init__(self, **data):
         super().__init__(**data)
         indices = self.descriptor_indices
upgini/autofe/timeseries/roll.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Dict, Optional
 
 from upgini.autofe.operator import ParametrizedOperator
 from upgini.autofe.timeseries.base import TimeSeriesBase
+from upgini.autofe.utils import pydantic_validator
 
 # Roll aggregation functions
 roll_aggregations = {
@@ -12,19 +13,13 @@ roll_aggregations = {
     "iqr": lambda x: x.quantile(0.75) - x.quantile(0.25),
 }
 
-try:
-    from pydantic import field_validator as validator  # V2
-except ImportError:
-    from pydantic import validator  # V1
-
 
 class Roll(TimeSeriesBase, ParametrizedOperator):
     aggregation: str
     window_size: int = 1
    window_unit: str = "D"
 
-    @
-    @classmethod
+    @pydantic_validator("window_unit")
     def validate_window_unit(cls, v: str) -> str:
         try:
             pd.tseries.frequencies.to_offset(v)
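The window_unit validator delegates acceptance to pandas' own offset parser; a standalone sketch of what passes and what raises:

# Any string pandas can parse as an offset alias is a valid window_unit;
# anything else raises ValueError inside the validator's try block.
import pandas as pd

for unit in ["D", "h", "7D"]:
    pd.tseries.frequencies.to_offset(unit)  # parses fine

try:
    pd.tseries.frequencies.to_offset("not-a-unit")
except ValueError as err:
    print(f"rejected: {err}")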
upgini/autofe/utils.py
ADDED
@@ -0,0 +1,83 @@
+"""
+Utility functions for autofe module.
+"""
+
+import functools
+from typing import Callable
+
+
+def get_pydantic_version():
+    """
+    Get the major version of pydantic.
+
+    Returns:
+        int: Major version number (1 or 2)
+    """
+    try:
+        from pydantic import __version__ as pydantic_version
+
+        major_version = int(pydantic_version.split(".")[0])
+        return major_version
+    except (ImportError, ValueError):
+        # Default to version 1 if unable to determine
+        return 1
+
+
+def pydantic_validator(field_name: str, *fields, mode: str = "before", **kwargs):
+    """
+    A decorator that applies the appropriate Pydantic validator based on the installed version.
+
+    This decorator handles the differences between Pydantic v1 and v2 validator syntax,
+    making it easier to write code that works with both versions.
+
+    Args:
+        field_name (str): The name of the field to validate
+        mode (str): The validation mode, either "before" or "after" (for Pydantic v2)
+        **kwargs: Additional arguments to pass to the validator
+
+    Returns:
+        Callable: A decorator that can be applied to validator methods
+
+    Example:
+        ```python
+        class MyModel(BaseModel):
+            items: List[int]
+
+            @pydantic_validator("items")
+            def parse_items(cls, value):
+                if isinstance(value, str):
+                    return [int(x) for x in value.split(",")]
+                return value
+        ```
+    """
+    pydantic_version = get_pydantic_version()
+
+    if pydantic_version >= 2:
+        # Use field_validator for Pydantic 2.x
+        from pydantic import field_validator
+
+        def decorator(func: Callable) -> Callable:
+            @field_validator(field_name, *fields, mode=mode, **kwargs)
+            @functools.wraps(func)
+            def wrapper(cls, value, **kw):
+                return func(cls, value)
+
+            return wrapper
+
+        return decorator
+    else:
+        # Use validator for Pydantic 1.x
+        from pydantic import validator
+
+        # Map mode to Pydantic v1 parameters
+        pre = True if mode == "before" else False
+
+        def decorator(func: Callable) -> Callable:
+            @validator(field_name, *fields, pre=pre, **kwargs)
+            @functools.wraps(func)
+            def wrapper(cls, value, **kw):
+                return func(cls, value)
+
+            return wrapper
+
+        return decorator
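Taken together, this shim lets one validator body run as @validator(..., pre=True) on Pydantic 1.x and as @field_validator(..., mode="before") on 2.x. A minimal usage sketch with an illustrative model (the model itself is not part of the package; the usage mirrors the docstring example above):

import json
from typing import List

from pydantic import BaseModel

from upgini.autofe.utils import pydantic_validator


class Params(BaseModel):
    bounds: List[int] = []

    @pydantic_validator("bounds", mode="before")
    def parse_bounds(cls, value):
        # Accept the JSON-string form that serialized params round-trip through
        if isinstance(value, str):
            return json.loads(value)
        return value


print(Params(bounds="[1, 2, 3]").bounds)  # [1, 2, 3] under either major version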
upgini/features_enricher.py
CHANGED
@@ -308,7 +308,8 @@ class FeaturesEnricher(TransformerMixin):
             self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
             file_metadata = self._search_task.get_file_metadata(trace_id)
             x_columns = [c.originalName or c.name for c in file_metadata.columns]
-
+            df = pd.DataFrame(columns=x_columns)
+            self.__prepare_feature_importances(trace_id, df, silent=True)
             # TODO validate search_keys with search_keys from file_metadata
             print(self.bundle.get("search_by_task_id_finish"))
             self.logger.debug(f"Successfully initialized with search_id: {search_id}")
@@ -701,6 +702,7 @@ class FeaturesEnricher(TransformerMixin):
     def transform(
         self,
         X: pd.DataFrame,
+        y: Optional[pd.Series] = None,
         *args,
         exclude_features_sources: Optional[List[str]] = None,
         keep_input: bool = True,
@@ -765,6 +767,7 @@ class FeaturesEnricher(TransformerMixin):
         result, _, _ = self.__inner_transform(
             trace_id,
             X,
+            y=y,
             exclude_features_sources=exclude_features_sources,
             importance_threshold=importance_threshold,
             max_features=max_features,
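In user-facing terms, transform() now optionally accepts the target and forwards it to __inner_transform. A sketch assuming an already-fitted enricher and an X/y pair of matching length:

# Before this release:  enriched = enricher.transform(X)
# From this release on, the target can be passed as well; later hunks make it
# mandatory when the fitted search selected features generated on the target.
enriched = enricher.transform(X, y)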
@@ -1087,7 +1090,7 @@
             enriched_shaps = enriched_cv_result.shap_values
 
             if enriched_shaps is not None:
-                self._update_shap_values(trace_id,
+                self._update_shap_values(trace_id, fitting_X, enriched_shaps)
 
             if enriched_metric is None:
                 self.logger.warning(
@@ -1255,14 +1258,14 @@
         finally:
             self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
 
-    def _update_shap_values(self, trace_id: str,
+    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float]):
         renaming = self.fit_columns_renaming or {}
         new_shaps = {
             renaming.get(feature, feature): _round_shap_value(shap)
             for feature, shap in new_shaps.items()
             if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
         }
-        self.__prepare_feature_importances(trace_id,
+        self.__prepare_feature_importances(trace_id, df, new_shaps)
 
         if self.features_info_display_handle is not None:
             try:
@@ -1681,7 +1684,6 @@
             validated_X,
             validated_y,
             eval_set,
-            is_demo_dataset,
             exclude_features_sources,
             trace_id,
             progress_bar,
@@ -1872,158 +1874,147 @@
         validated_X: pd.DataFrame,
         validated_y: pd.Series,
         eval_set: Optional[List[tuple]],
-        is_demo_dataset: bool,
         exclude_features_sources: Optional[List[str]],
         trace_id: str,
         progress_bar: Optional[ProgressBar],
         progress_callback: Optional[Callable[[SearchProgress], Any]],
     ) -> _SampledDataForMetrics:
-
-        if eval_set is not None:
-            self.logger.info("Transform with eval_set")
-            # concatenate X and eval_set with eval_set_index
-            df = validated_X.copy()
-            df[TARGET] = validated_y
-            df[EVAL_SET_INDEX] = 0
-            for idx, eval_pair in enumerate(eval_set):
-                eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
-                eval_df_with_index = eval_x.copy()
-                eval_df_with_index[TARGET] = eval_y
-                eval_df_with_index[EVAL_SET_INDEX] = idx + 1
-                df = pd.concat([df, eval_df_with_index])
-
-            df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
-
-            # downsample if need to eval_set threshold
-            num_samples = _num_samples(df)
-            force_downsampling = (
-                not self.disable_force_downsampling
-                and self.columns_for_online_api is not None
-                and num_samples > Dataset.FORCE_SAMPLE_SIZE
-            )
-            # TODO: check that system_record_id was added before this step
-            if force_downsampling:
-                self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-                df = balance_undersample_forced(
-                    df=df,
-                    target_column=TARGET,
-                    id_columns=self.id_columns,
-                    date_column=self._get_date_column(self.search_keys),
-                    task_type=self.model_task_type,
-                    cv_type=self.cv,
-                    random_state=self.random_state,
-                    sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                    logger=self.logger,
-                    bundle=self.bundle,
-                    warning_callback=self.__log_warning,
-                )
-            elif num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
-                self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
-                df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
-
-                trace_id,
-                df,
-                exclude_features_sources=exclude_features_sources,
-                silent_mode=True,
-                metrics_calculation=True,
-                progress_bar=progress_bar,
-                progress_callback=progress_callback,
-                add_fit_system_record_id=True,
-                target_name=tmp_target_name,
-            )
-            if enriched_df is None:
-                return None
-
-            enriched_X_columns = enriched_X.columns.tolist()
-
-                enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
-                eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
-        else:
-            self.logger.info("Transform without eval_set")
-            df = validated_X.copy()
-
-            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
-            df = balance_undersample_forced(
-                df=df,
-                target_column=TARGET,
-                id_columns=self.id_columns,
-                date_column=self._get_date_column(self.search_keys),
-                task_type=self.model_task_type,
-                cv_type=self.cv,
-                random_state=self.random_state,
-                sample_size=Dataset.FORCE_SAMPLE_SIZE,
-                logger=self.logger,
-                bundle=self.bundle,
-                warning_callback=self.__log_warning,
-            )
-        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
-            self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
-            df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
-
-            c
-            for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
-            if c in enriched_Xy.columns
-        ]
+        has_eval_set = eval_set is not None
 
+        self.logger.info(f"Transform {'with' if has_eval_set else 'without'} eval_set")
+
+        # Prepare
+        df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set)
+        df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
+        df = self.__downsample_for_metrics(df)
+
+        # Transform
+        enriched_df, _, _ = self.__inner_transform(
+            trace_id,
+            X=df.drop(columns=[TARGET]),
+            y=df[TARGET],
+            exclude_features_sources=exclude_features_sources,
+            silent_mode=True,
+            metrics_calculation=True,
+            progress_bar=progress_bar,
+            progress_callback=progress_callback,
+            add_fit_system_record_id=True,
+        )
+        if enriched_df is None:
+            return None
+
+        x_columns = [
+            c
+            for c in (validated_X.columns.tolist() + self.fit_generated_features + [SYSTEM_RECORD_ID])
+            if c in enriched_df.columns
+        ]
+
+        X_sampled, y_sampled, enriched_X = self.__extract_train_data(enriched_df, x_columns)
+        eval_set_sampled_dict = self.__extract_eval_data(
+            enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
+        )
+
+        # Cache and return results
+        return self.__cache_and_return_results(
+            validated_X, validated_y, eval_set, X_sampled, y_sampled, enriched_X, eval_set_sampled_dict
+        )
+
+    def __combine_train_and_eval_sets(
+        self, validated_X: pd.DataFrame, validated_y: pd.Series, eval_set: Optional[List[tuple]]
+    ) -> pd.DataFrame:
+        df = validated_X.copy()
+        df[TARGET] = validated_y
+        if eval_set is None:
+            return df
+
+        df[EVAL_SET_INDEX] = 0
+
+        for idx, eval_pair in enumerate(eval_set):
+            eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
+            eval_df_with_index = eval_x.copy()
+            eval_df_with_index[TARGET] = eval_y
+            eval_df_with_index[EVAL_SET_INDEX] = idx + 1
+            df = pd.concat([df, eval_df_with_index])
+
+        return df
+
+    def __downsample_for_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
+        num_samples = _num_samples(df)
+        force_downsampling = (
+            not self.disable_force_downsampling
+            and self.columns_for_online_api is not None
+            and num_samples > Dataset.FORCE_SAMPLE_SIZE
+        )
+
+        if force_downsampling:
+            self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
+            return balance_undersample_forced(
+                df=df,
+                target_column=TARGET,
+                id_columns=self.id_columns,
+                date_column=self._get_date_column(self.search_keys),
+                task_type=self.model_task_type,
+                cv_type=self.cv,
+                random_state=self.random_state,
+                sample_size=Dataset.FORCE_SAMPLE_SIZE,
+                logger=self.logger,
+                bundle=self.bundle,
+                warning_callback=self.__log_warning,
             )
+        elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
+            if EVAL_SET_INDEX in df.columns:
+                threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
+            else:
+                threshold = Dataset.FIT_SAMPLE_THRESHOLD
+                sample_size = Dataset.FIT_SAMPLE_ROWS
 
+            if num_samples > threshold:
+                self.logger.info(f"Downsampling from {num_samples} to {sample_size}")
+                return df.sample(n=sample_size, random_state=self.random_state)
+
+        return df
+
+    def __extract_train_data(
+        self, enriched_df: pd.DataFrame, x_columns: List[str]
+    ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame]:
+        if EVAL_SET_INDEX in enriched_df.columns:
+            enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
+        else:
+            enriched_Xy = enriched_df
+        X_sampled = enriched_Xy[x_columns].copy()
+        y_sampled = enriched_Xy[TARGET].copy()
+        enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
+        return X_sampled, y_sampled, enriched_X
+
+    def __extract_eval_data(
+        self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
+    ) -> Dict[int, Tuple]:
+        eval_set_sampled_dict = {}
+
+        for idx in range(eval_set_len):
+            enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
+            eval_x_sampled = enriched_eval_xy[x_columns].copy()
+            eval_y_sampled = enriched_eval_xy[TARGET].copy()
+            enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
+            eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
+
+        return eval_set_sampled_dict
+
+    def __cache_and_return_results(
+        self,
+        validated_X: pd.DataFrame,
+        validated_y: pd.Series,
+        eval_set: Optional[List[tuple]],
+        X_sampled: pd.DataFrame,
+        y_sampled: pd.Series,
+        enriched_X: pd.DataFrame,
+        eval_set_sampled_dict: Dict[int, Tuple],
+    ) -> _SampledDataForMetrics:
         datasets_hash = hash_input(validated_X, validated_y, eval_set)
+        columns_renaming = getattr(self, "fit_columns_renaming", {})
+
         self.__cached_sampled_datasets[datasets_hash] = (
             X_sampled,
             y_sampled,
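The refactor above splits one monolithic sampling method into __combine_train_and_eval_sets, __downsample_for_metrics, __extract_train_data, __extract_eval_data, and __cache_and_return_results. The combine/extract pair is simply an EVAL_SET_INDEX round-trip; a toy illustration with hypothetical frames and stand-in constants:

import pandas as pd

TARGET, EVAL_SET_INDEX = "target", "eval_set_index"  # stand-ins for the real constants

train = pd.DataFrame({"f": [1, 2]}).assign(**{TARGET: [0, 1], EVAL_SET_INDEX: 0})
eval1 = pd.DataFrame({"f": [3]}).assign(**{TARGET: [1], EVAL_SET_INDEX: 1})
df = pd.concat([train, eval1])  # the shape __combine_train_and_eval_sets builds

# What __extract_train_data / __extract_eval_data recover after enrichment:
assert len(df.query(f"{EVAL_SET_INDEX} == 0")) == 2
assert len(df.query(f"{EVAL_SET_INDEX} == 1")) == 1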
@@ -2160,6 +2151,7 @@ if response.status_code == 200:
         trace_id: str,
         X: pd.DataFrame,
         *,
+        y: Optional[pd.Series] = None,
         exclude_features_sources: Optional[List[str]] = None,
         importance_threshold: Optional[float] = None,
         max_features: Optional[int] = None,
@@ -2178,8 +2170,14 @@
         self.logger.info("Start transform")
 
         validated_X = self._validate_X(X, is_transform=True)
+        if y is not None:
+            validated_y = self._validate_y(validated_X, y)
+            df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
+        else:
+            validated_y = None
+            df = validated_X
 
-        self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
+        self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
 
         self.__validate_search_keys(self.search_keys, self.search_id)
@@ -2222,29 +2220,27 @@
             self.logger.info(msg)
             print(msg)
 
-        is_demo_dataset = hash_input(
+        is_demo_dataset = hash_input(df) in DEMO_DATASET_HASHES
 
         columns_to_drop = [
-            c for c in
+            c for c in df.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
         ]
         if len(columns_to_drop) > 0:
             msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
             self.logger.warning(msg)
             print(msg)
-
+            df = df.drop(columns=columns_to_drop)
 
         search_keys = self.search_keys.copy()
         if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
-
+            search_keys.update(
                 {col: SearchKey.CUSTOM_KEY for col in self.id_columns if col not in self.search_keys}
             )
 
         search_keys = self.__prepare_search_keys(
-
+            df, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
         )
 
-        df = validated_X.copy()
-
         df = self.__handle_index_search_keys(df, search_keys)
 
         if DEFAULT_INDEX in df.columns:
@@ -2283,8 +2279,11 @@
         features_for_transform = self._search_task.get_features_for_transform() or []
         if len(features_for_transform) > 0:
             missing_features_for_transform = [
-                columns_renaming.get(f) for f in features_for_transform if f not in df.columns
+                columns_renaming.get(f) or f for f in features_for_transform if f not in df.columns
             ]
+            if TARGET in missing_features_for_transform:
+                raise ValidationError(self.bundle.get("missing_target_for_transform"))
+
             if len(missing_features_for_transform) > 0:
                 raise ValidationError(
                     self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
@@ -2340,11 +2339,10 @@
             converter = PostalCodeSearchKeyConverter(postal_code)
             df = converter.convert(df)
 
-
+        meaning_types = {}
+        meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
+        meaning_types.update({col: key.value for col, key in search_keys.items()})
 
-        meaning_types = {col: key.value for col, key in search_keys.items()}
-        for col in features_for_transform:
-            meaning_types[col] = FileColumnMeaningType.FEATURE
         features_not_to_pass = [
             c
             for c in df.columns
@@ -2353,13 +2351,12 @@
             and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
         ]
 
-        if add_fit_system_record_id
-            reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
+        if add_fit_system_record_id:
             df = self.__add_fit_system_record_id(
                 df,
                 search_keys,
                 SYSTEM_RECORD_ID,
-
+                TARGET,
                 columns_renaming,
                 silent=True,
             )
@@ -3021,7 +3018,7 @@
             msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
             self.__log_warning(msg)
 
-        self.__prepare_feature_importances(trace_id,
+        self.__prepare_feature_importances(trace_id, df)
 
         self.__show_selected_features(self.fit_search_keys)
@@ -3796,7 +3793,7 @@
         return result_train, result_eval_sets
 
     def __prepare_feature_importances(
-        self, trace_id: str,
+        self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
     ):
         if self._search_task is None:
            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -3807,6 +3804,8 @@
         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
 
+        df = df.rename(columns=original_names_dict)
+
         self.feature_names_ = []
         self.dropped_client_feature_names_ = []
         self.feature_importances_ = []
@@ -3825,7 +3824,7 @@
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]
 
-            is_client_feature = feature_meta.name in
+            is_client_feature = feature_meta.name in df.columns
 
             # TODO make a decision about selected features based on special flag from mlb
             if original_shaps.get(feature_meta.name, 0.0) == 0.0:
@@ -3845,7 +3844,7 @@
                 self.feature_names_.append(feature_meta.name)
                 self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
 
-            df_for_sample = features_df if feature_meta.name in features_df.columns else
+            df_for_sample = features_df if feature_meta.name in features_df.columns else df
             feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
             features_info.append(feature_info.to_row(self.bundle))
             features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
upgini/resource_bundle/strings.properties
CHANGED

@@ -136,6 +136,7 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
 missing_features_for_transform=Missing some features for transform that were presented on fit: {}
+missing_target_for_transform=Search contains features on target. Please add y to the call and try again
 missing_id_column=Id column {} not found in X
 # target validation
 empty_target=Target is empty in all rows
upgini/search_task.py
CHANGED
@@ -168,7 +168,13 @@ class SearchTask:
         for meta in self.provider_metadata_v2:
             if meta.features_used_for_embeddings is not None:
                 features_for_transform.update(meta.features_used_for_embeddings)
-
+            if meta.generated_features:
+                features_for_transform.update(
+                    c.original_name
+                    for f in meta.generated_features
+                    for c in f.base_columns
+                    if c.ads_definition_id is None
+                )
         return list(features_for_transform)
 
     def get_shuffle_kfold(self) -> Optional[bool]:
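The new block widens get_features_for_transform to also demand the client-side base columns of generated features (those without an ads_definition_id). A toy model of the comprehension with hypothetical stand-in types:

from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class BaseColumn:                      # stand-in for the real metadata type
    original_name: str
    ads_definition_id: Optional[str] = None

@dataclass
class GeneratedFeature:                # stand-in for the real metadata type
    base_columns: List[BaseColumn] = field(default_factory=list)

generated = [GeneratedFeature([BaseColumn("target"), BaseColumn("ads_col", "ads-1")])]
needed = {c.original_name for f in generated for c in f.base_columns if c.ads_definition_id is None}
print(needed)  # {'target'}: this is why transform() may now require y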
upgini/utils/feature_info.py
CHANGED
@@ -88,8 +88,11 @@ class FeatureInfo:
 
 
 def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
-    if data is not None and feature_meta.name in data.columns:
-
+    if data is not None and len(data) > 0 and feature_meta.name in data.columns:
+        if len(data) > 3:
+            feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
+        else:
+            feature_sample = data[feature_meta.name].dropna().unique().tolist()
         if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
             feature_sample = [round(f, 4) for f in feature_sample]
         feature_sample = [str(f) for f in feature_sample]
@@ -123,7 +126,11 @@ def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) ->
 
 
 def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
-
+    providers = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
+    if providers:
+        return ", ".join(providers)
+    else:
+        return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
 
 
 def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
@@ -137,13 +144,17 @@ def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> st
 
 
 def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
-
-
-
-
-
-
-
+    sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
+    if sources:
+        return ", ".join(sources)
+    else:
+        return feature_meta.data_source or (
+            LLM_SOURCE
+            if not feature_meta.name.endswith("_country")
+            and not feature_meta.name.endswith("_postal_code")
+            and not is_client_feature
+            else ""
+        )
 
 
 def _list_or_single(lst: List[str], single: str):
@@ -161,7 +172,7 @@ def _to_anchor(link: str, value: str) -> str:
     return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
 
 
-def _make_links(names: List[str], links: List[str]):
+def _make_links(names: List[str], links: List[str]) -> str:
     all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
     return ",".join(all_links)
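A standalone sketch of the new sampling path in _get_feature_sample (toy column; note that np.random.choice samples with replacement by default, so the three values may repeat):

import numpy as np
import pandas as pd

col = pd.Series([1.23456, 2.34567, None, 3.45678, 4.56789])
values = col.dropna().unique()
# len(data) > 3 here, so the choice branch runs; otherwise all uniques are taken
sample = np.random.choice(values, 3).tolist() if len(col) > 3 else values.tolist()
print([str(round(f, 4)) for f in sample])  # e.g. ['2.3457', '1.2346', '2.3457']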
upgini/utils/mstats.py
CHANGED
@@ -118,7 +118,7 @@ def spearmanr(
     # - dof: degrees of freedom
     # - t_stat: t-statistic
     # - alternative: 'two-sided', 'greater', 'less'
-    def compute_t_pvalue(t_stat, dof, alternative=
+    def compute_t_pvalue(t_stat, dof, alternative="two-sided"):
         from scipy.stats import t
 
         if alternative == "two-sided":
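The reconstructed signature defaults alternative to "two-sided". For reference, the standard p-value formulas for each alternative (a sketch of the textbook math, not necessarily upstream's exact body):

from scipy.stats import t

def t_pvalue(t_stat, dof, alternative="two-sided"):
    if alternative == "two-sided":
        return 2 * t.sf(abs(t_stat), dof)  # both tails
    elif alternative == "greater":
        return t.sf(t_stat, dof)           # upper tail
    return t.cdf(t_stat, dof)              # lower tail ("less")

print(t_pvalue(2.0, 30))  # ~0.0546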
upgini/utils/sort.py
CHANGED
@@ -49,7 +49,7 @@ def sort_columns(
     target = target_column if isinstance(target_column, pd.Series) else df[target_column]
     target = prepare_target(target, model_task_type)
     sort_dict = get_sort_columns_dict(
-        df[sorted_keys + other_columns], target, sorted_keys,
+        df[sorted_keys + other_columns], target, sorted_keys, sort_all_columns=sort_all_columns
     )
     other_columns = [c for c in other_columns if c in sort_dict]
     columns_for_sort = sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
@@ -60,7 +60,6 @@ def get_sort_columns_dict(
     df: pd.DataFrame,
     target: pd.Series,
     sorted_keys: List[str],
-    omit_nan: bool,
     n_jobs: Optional[int] = None,
     sort_all_columns: bool = False,
 ) -> Dict[str, Any]:
@@ -78,6 +77,13 @@ def get_sort_columns_dict(
         return {}
 
     df = df[columns_for_sort]
+    df_with_target = pd.concat([df, target], axis=1)
+    # Drop rows where target is NaN
+    df_with_target = df_with_target.loc[~target.isna()]
+    df = df_with_target.iloc[:, :-1]
+    target = df_with_target.iloc[:, -1]
+    df = df.fillna(df.mean())
+    omit_nan = False
     hashes = [hash_series(df[col]) for col in columns_for_sort]
     df = np.asarray(df, dtype=np.float32)
     correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)
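A toy illustration of the new NaN handling in get_sort_columns_dict: rows with a NaN target are dropped (the concat keeps indices aligned), remaining feature NaNs are mean-filled, and omit_nan is then forced off:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, 3.0]})
target = pd.Series([0.0, 1.0, np.nan])

dft = pd.concat([df, target], axis=1).loc[~target.isna()]
df, target = dft.iloc[:, :-1], dft.iloc[:, -1]
df = df.fillna(df.mean())
print(df["a"].tolist())  # [1.0, 1.0]: NaN-target row dropped, NaN feature mean-filled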
{upgini-1.2.66a3818.dev1.dist-info → upgini-1.2.68a3818.dev1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.66a3818.dev1
+Version: 1.2.68a3818.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
{upgini-1.2.66a3818.dev1.dist-info → upgini-1.2.68a3818.dev1.dist-info}/RECORD
CHANGED

@@ -1,14 +1,14 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=B8ku0HzP4G2N6EyFXdX43ZRi57azPbbOINogoH1dGG4,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=KBTdADF7_Wj3uDROYdevukOk6R8LVQw47gJkH4M1_iQ,204435
 upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
 upgini/metrics.py,sha256=t7uOOnlDYvP6E3DLjPMQcFBjyhJfUQY8aUlx7N0Mh-s,35477
-upgini/search_task.py,sha256=
+upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
 upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
@@ -16,18 +16,19 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
 upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
 upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
-upgini/autofe/date.py,sha256=
+upgini/autofe/date.py,sha256=C86F7sPiscUGq2a45UtQA9ADWBWg0kt54mePHHzjbLE,10633
 upgini/autofe/feature.py,sha256=y1x3wijhTVBmloayQAHiscqKU9Ll8kLcGm1PdvS357I,14910
 upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
 upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
+upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
 upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
 upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
 upgini/autofe/timeseries/base.py,sha256=T9Ec8LKJbiwTUGGsd_xhM0U0NUJblqmKchkzUI1sK88,3755
-upgini/autofe/timeseries/cross.py,sha256=
+upgini/autofe/timeseries/cross.py,sha256=qdoMGKg0auoYKwu4Vz8V3XDs_6-5j9sE4gcwfAR41Ws,5231
 upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
 upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
-upgini/autofe/timeseries/roll.py,sha256=
+upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_aCHg,2794
 upgini/autofe/timeseries/trend.py,sha256=9p2Q5ByAi6cx9RH9teBTe8FyjSzqthznC2Lo5dsJ0ho,2051
 upgini/autofe/timeseries/volatility.py,sha256=9shUmIKjpWTHVYjj80YBsk0XheBJ9uBuLv5NW9Mchnk,7953
 upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -38,7 +39,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=
+upgini/resource_bundle/strings.properties,sha256=LDT-jtYlrD1IXvWjFSf-dtvapje0qSrqI9W3v7y2zVo,27646
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -56,21 +57,21 @@ upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuM
 upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
-upgini/utils/feature_info.py,sha256=
+upgini/utils/feature_info.py,sha256=m1tQcT3hTChPAiXzpk0WQcEqElj8KgeCifEJFa7-gss,7247
 upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
-upgini/utils/mstats.py,sha256=
+upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
-upgini/utils/sort.py,sha256=
+upgini/utils/sort.py,sha256=VDXgZObIVAuGzXlAEejlKCNQcHmN5pN2bMou58sDKFI,6729
 upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.66a3818.dev1.dist-info/METADATA,sha256=
-upgini-1.2.66a3818.dev1.dist-info/WHEEL,sha256=
-upgini-1.2.66a3818.dev1.dist-info/licenses/LICENSE,sha256=
-upgini-1.2.66a3818.dev1.dist-info/RECORD,,
+upgini-1.2.68a3818.dev1.dist-info/METADATA,sha256=b70LVYxQjLh3v0j-pbeT-PWuf065TUhpgQxt_prM2Oo,49123
+upgini-1.2.68a3818.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.68a3818.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.68a3818.dev1.dist-info/RECORD,,
{upgini-1.2.66a3818.dev1.dist-info → upgini-1.2.68a3818.dev1.dist-info}/WHEEL
File without changes

{upgini-1.2.66a3818.dev1.dist-info → upgini-1.2.68a3818.dev1.dist-info}/licenses/LICENSE
File without changes