upgini 1.2.68a3818.dev3__py3-none-any.whl → 1.2.68a3832.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +21 -15
- upgini/autofe/feature.py +1 -5
- upgini/autofe/timeseries/base.py +1 -1
- upgini/autofe/timeseries/cross.py +12 -20
- upgini/autofe/timeseries/roll.py +7 -2
- upgini/features_enricher.py +149 -142
- upgini/metrics.py +71 -27
- upgini/resource_bundle/strings.properties +1 -1
- upgini/search_task.py +1 -7
- upgini/utils/mstats.py +1 -1
- {upgini-1.2.68a3818.dev3.dist-info → upgini-1.2.68a3832.dev1.dist-info}/METADATA +3 -2
- {upgini-1.2.68a3818.dev3.dist-info → upgini-1.2.68a3832.dev1.dist-info}/RECORD +15 -16
- {upgini-1.2.68a3818.dev3.dist-info → upgini-1.2.68a3832.dev1.dist-info}/WHEEL +1 -1
- upgini/autofe/utils.py +0 -83
- {upgini-1.2.68a3818.dev3.dist-info → upgini-1.2.68a3832.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.68a3818.dev3"
|
|
1
|
+
__version__ = "1.2.68a3832.dev1"
|
upgini/autofe/date.py
CHANGED
|
@@ -8,7 +8,6 @@ from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
|
8
8
|
from pydantic import BaseModel, __version__ as pydantic_version
|
|
9
9
|
|
|
10
10
|
from upgini.autofe.operator import PandasOperator, ParametrizedOperator
|
|
11
|
-
from upgini.autofe.utils import pydantic_validator
|
|
12
11
|
|
|
13
12
|
|
|
14
13
|
def get_pydantic_version():
|
|
@@ -210,14 +209,6 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
|
|
|
210
209
|
|
|
211
210
|
return cls(diff_unit=diff_unit, lower_bound=lower_bound, upper_bound=upper_bound, aggregation=aggregation)
|
|
212
211
|
|
|
213
|
-
def get_params(self) -> Dict[str, Optional[str]]:
|
|
214
|
-
res = super().get_params()
|
|
215
|
-
if self.lower_bound is not None:
|
|
216
|
-
res["lower_bound"] = str(self.lower_bound)
|
|
217
|
-
if self.upper_bound is not None:
|
|
218
|
-
res["upper_bound"] = str(self.upper_bound)
|
|
219
|
-
return res
|
|
220
|
-
|
|
221
212
|
def _agg(self, x):
|
|
222
213
|
x = x[
|
|
223
214
|
(x >= (self.lower_bound if self.lower_bound is not None else -np.inf))
|
|
@@ -278,17 +269,32 @@ class DatePercentile(DatePercentileBase):
|
|
|
278
269
|
{
|
|
279
270
|
"zero_month": self.zero_month,
|
|
280
271
|
"zero_year": self.zero_year,
|
|
281
|
-
"zero_bounds":
|
|
272
|
+
"zero_bounds": self.zero_bounds,
|
|
282
273
|
"step": self.step,
|
|
283
274
|
}
|
|
284
275
|
)
|
|
285
276
|
return res
|
|
286
277
|
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
278
|
+
# Check Pydantic version
|
|
279
|
+
if get_pydantic_version() >= 2:
|
|
280
|
+
# Use @field_validator for Pydantic 2.x
|
|
281
|
+
from pydantic import field_validator
|
|
282
|
+
|
|
283
|
+
@field_validator("zero_bounds", mode="before")
|
|
284
|
+
def parse_zero_bounds(cls, value):
|
|
285
|
+
if isinstance(value, str):
|
|
286
|
+
return json.loads(value)
|
|
287
|
+
return value
|
|
288
|
+
|
|
289
|
+
else:
|
|
290
|
+
# Use @validator for Pydantic 1.x
|
|
291
|
+
from pydantic import validator
|
|
292
|
+
|
|
293
|
+
@validator("zero_bounds", pre=True)
|
|
294
|
+
def parse_zero_bounds(cls, value):
|
|
295
|
+
if isinstance(value, str):
|
|
296
|
+
return json.loads(value)
|
|
297
|
+
return value
|
|
292
298
|
|
|
293
299
|
def _get_bounds(self, date_col: pd.Series) -> pd.Series:
|
|
294
300
|
months = date_col.dt.month
|
upgini/autofe/feature.py
CHANGED
|
@@ -112,11 +112,7 @@ class Feature:
|
|
|
112
112
|
|
|
113
113
|
def get_hash(self) -> str:
|
|
114
114
|
return hashlib.sha256(
|
|
115
|
-
"_".join(
|
|
116
|
-
[self.op.get_hash_component()]
|
|
117
|
-
+ [ch.op.get_hash_component() for ch in self.children if isinstance(ch, Feature)]
|
|
118
|
-
+ [ch.get_display_name() for ch in self.children]
|
|
119
|
-
).encode("utf-8")
|
|
115
|
+
"_".join([self.op.get_hash_component()] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
|
|
120
116
|
).hexdigest()[:8]
|
|
121
117
|
|
|
122
118
|
def set_alias(self, alias: str) -> "Feature":
|
upgini/autofe/timeseries/base.py
CHANGED
|
upgini/autofe/timeseries/cross.py
CHANGED
|
@@ -1,13 +1,16 @@
|
|
|
1
|
-
import json
|
|
2
1
|
from typing import Dict, List, Optional
|
|
3
2
|
|
|
4
3
|
import numpy as np
|
|
5
4
|
import pandas as pd
|
|
6
5
|
|
|
6
|
+
try:
|
|
7
|
+
from pydantic import field_validator as validator # V2
|
|
8
|
+
except ImportError:
|
|
9
|
+
from pydantic import validator # V1
|
|
10
|
+
|
|
7
11
|
from upgini.autofe.all_operators import find_op
|
|
8
12
|
from upgini.autofe.operator import PandasOperator, ParametrizedOperator
|
|
9
13
|
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
10
|
-
from upgini.autofe.utils import pydantic_validator
|
|
11
14
|
|
|
12
15
|
|
|
13
16
|
class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
|
|
@@ -17,22 +20,11 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
|
|
|
17
20
|
left_descriptor: List[str] = []
|
|
18
21
|
right_descriptor: List[str] = []
|
|
19
22
|
|
|
20
|
-
@
|
|
23
|
+
@validator("descriptor_indices")
|
|
24
|
+
@classmethod
|
|
21
25
|
def validate_descriptor_indices(cls, v):
|
|
22
|
-
if
|
|
23
|
-
|
|
24
|
-
return v
|
|
25
|
-
|
|
26
|
-
@pydantic_validator("left_descriptor", "right_descriptor", mode="before")
|
|
27
|
-
def parse_descriptors(cls, v):
|
|
28
|
-
if isinstance(v, str):
|
|
29
|
-
return json.loads(v)
|
|
30
|
-
return v
|
|
31
|
-
|
|
32
|
-
@pydantic_validator("interaction_op", mode="before")
|
|
33
|
-
def validate_interaction_op(cls, v):
|
|
34
|
-
if isinstance(v, str):
|
|
35
|
-
return find_op(v)
|
|
26
|
+
if not v:
|
|
27
|
+
raise ValueError("descriptor_indices cannot be empty for CrossSeriesInteraction")
|
|
36
28
|
return v
|
|
37
29
|
|
|
38
30
|
def __init__(self, **data):
|
|
@@ -96,9 +88,9 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
|
|
|
96
88
|
res.update(
|
|
97
89
|
{
|
|
98
90
|
"interaction_op": self._get_interaction_op_name(),
|
|
99
|
-
"descriptor_indices":
|
|
100
|
-
"left_descriptor":
|
|
101
|
-
"right_descriptor":
|
|
91
|
+
"descriptor_indices": self.descriptor_indices,
|
|
92
|
+
"left_descriptor": self.left_descriptor,
|
|
93
|
+
"right_descriptor": self.right_descriptor,
|
|
102
94
|
}
|
|
103
95
|
)
|
|
104
96
|
return res
|
upgini/autofe/timeseries/roll.py
CHANGED
|
@@ -3,7 +3,6 @@ from typing import Dict, Optional
|
|
|
3
3
|
|
|
4
4
|
from upgini.autofe.operator import ParametrizedOperator
|
|
5
5
|
from upgini.autofe.timeseries.base import TimeSeriesBase
|
|
6
|
-
from upgini.autofe.utils import pydantic_validator
|
|
7
6
|
|
|
8
7
|
# Roll aggregation functions
|
|
9
8
|
roll_aggregations = {
|
|
@@ -13,13 +12,19 @@ roll_aggregations = {
|
|
|
13
12
|
"iqr": lambda x: x.quantile(0.75) - x.quantile(0.25),
|
|
14
13
|
}
|
|
15
14
|
|
|
15
|
+
try:
|
|
16
|
+
from pydantic import field_validator as validator # V2
|
|
17
|
+
except ImportError:
|
|
18
|
+
from pydantic import validator # V1
|
|
19
|
+
|
|
16
20
|
|
|
17
21
|
class Roll(TimeSeriesBase, ParametrizedOperator):
|
|
18
22
|
aggregation: str
|
|
19
23
|
window_size: int = 1
|
|
20
24
|
window_unit: str = "D"
|
|
21
25
|
|
|
22
|
-
@
|
|
26
|
+
@validator("window_unit")
|
|
27
|
+
@classmethod
|
|
23
28
|
def validate_window_unit(cls, v: str) -> str:
|
|
24
29
|
try:
|
|
25
30
|
pd.tseries.frequencies.to_offset(v)
|
upgini/features_enricher.py
CHANGED
|
@@ -702,7 +702,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
702
702
|
def transform(
|
|
703
703
|
self,
|
|
704
704
|
X: pd.DataFrame,
|
|
705
|
-
y: Optional[pd.Series] = None,
|
|
706
705
|
*args,
|
|
707
706
|
exclude_features_sources: Optional[List[str]] = None,
|
|
708
707
|
keep_input: bool = True,
|
|
@@ -767,7 +766,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
767
766
|
result, _, _ = self.__inner_transform(
|
|
768
767
|
trace_id,
|
|
769
768
|
X,
|
|
770
|
-
y=y,
|
|
771
769
|
exclude_features_sources=exclude_features_sources,
|
|
772
770
|
importance_threshold=importance_threshold,
|
|
773
771
|
max_features=max_features,
|
|
@@ -1684,6 +1682,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1684
1682
|
validated_X,
|
|
1685
1683
|
validated_y,
|
|
1686
1684
|
eval_set,
|
|
1685
|
+
is_demo_dataset,
|
|
1687
1686
|
exclude_features_sources,
|
|
1688
1687
|
trace_id,
|
|
1689
1688
|
progress_bar,
|
|
@@ -1874,147 +1873,158 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1874
1873
|
validated_X: pd.DataFrame,
|
|
1875
1874
|
validated_y: pd.Series,
|
|
1876
1875
|
eval_set: Optional[List[tuple]],
|
|
1876
|
+
is_demo_dataset: bool,
|
|
1877
1877
|
exclude_features_sources: Optional[List[str]],
|
|
1878
1878
|
trace_id: str,
|
|
1879
1879
|
progress_bar: Optional[ProgressBar],
|
|
1880
1880
|
progress_callback: Optional[Callable[[SearchProgress], Any]],
|
|
1881
1881
|
) -> _SampledDataForMetrics:
|
|
1882
|
-
|
|
1883
|
-
|
|
1884
|
-
|
|
1885
|
-
|
|
1886
|
-
|
|
1887
|
-
|
|
1888
|
-
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
|
|
1882
|
+
eval_set_sampled_dict = {}
|
|
1883
|
+
if eval_set is not None:
|
|
1884
|
+
self.logger.info("Transform with eval_set")
|
|
1885
|
+
# concatenate X and eval_set with eval_set_index
|
|
1886
|
+
df = validated_X.copy()
|
|
1887
|
+
df[TARGET] = validated_y
|
|
1888
|
+
df[EVAL_SET_INDEX] = 0
|
|
1889
|
+
for idx, eval_pair in enumerate(eval_set):
|
|
1890
|
+
eval_x, eval_y = self._validate_eval_set_pair(validated_X, eval_pair)
|
|
1891
|
+
eval_df_with_index = eval_x.copy()
|
|
1892
|
+
eval_df_with_index[TARGET] = eval_y
|
|
1893
|
+
eval_df_with_index[EVAL_SET_INDEX] = idx + 1
|
|
1894
|
+
df = pd.concat([df, eval_df_with_index])
|
|
1895
|
+
|
|
1896
|
+
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1897
|
+
|
|
1898
|
+
# downsample if need to eval_set threshold
|
|
1899
|
+
num_samples = _num_samples(df)
|
|
1900
|
+
force_downsampling = (
|
|
1901
|
+
not self.disable_force_downsampling
|
|
1902
|
+
and self.columns_for_online_api is not None
|
|
1903
|
+
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1904
|
+
)
|
|
1905
|
+
# TODO: check that system_record_id was added before this step
|
|
1906
|
+
if force_downsampling:
|
|
1907
|
+
self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
|
|
1908
|
+
df = balance_undersample_forced(
|
|
1909
|
+
df=df,
|
|
1910
|
+
target_column=TARGET,
|
|
1911
|
+
id_columns=self.id_columns,
|
|
1912
|
+
date_column=self._get_date_column(self.search_keys),
|
|
1913
|
+
task_type=self.model_task_type,
|
|
1914
|
+
cv_type=self.cv,
|
|
1915
|
+
random_state=self.random_state,
|
|
1916
|
+
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1917
|
+
logger=self.logger,
|
|
1918
|
+
bundle=self.bundle,
|
|
1919
|
+
warning_callback=self.__log_warning,
|
|
1920
|
+
)
|
|
1921
|
+
elif num_samples > Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD:
|
|
1922
|
+
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS}")
|
|
1923
|
+
df = df.sample(n=Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS, random_state=self.random_state)
|
|
1892
1924
|
|
|
1893
|
-
|
|
1894
|
-
trace_id,
|
|
1895
|
-
X=df.drop(columns=[TARGET]),
|
|
1896
|
-
y=df[TARGET],
|
|
1897
|
-
exclude_features_sources=exclude_features_sources,
|
|
1898
|
-
silent_mode=True,
|
|
1899
|
-
metrics_calculation=True,
|
|
1900
|
-
progress_bar=progress_bar,
|
|
1901
|
-
progress_callback=progress_callback,
|
|
1902
|
-
add_fit_system_record_id=True,
|
|
1903
|
-
)
|
|
1904
|
-
if enriched_df is None:
|
|
1905
|
-
return None
|
|
1925
|
+
eval_set_sampled_dict = {}
|
|
1906
1926
|
|
|
1907
|
-
|
|
1908
|
-
|
|
1909
|
-
for c in (validated_X.columns.tolist() + self.fit_generated_features + [SYSTEM_RECORD_ID])
|
|
1910
|
-
if c in enriched_df.columns
|
|
1911
|
-
]
|
|
1927
|
+
tmp_target_name = "__target"
|
|
1928
|
+
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1912
1929
|
|
|
1913
|
-
|
|
1914
|
-
|
|
1915
|
-
|
|
1916
|
-
|
|
1930
|
+
enriched_df, columns_renaming, generated_features = self.__inner_transform(
|
|
1931
|
+
trace_id,
|
|
1932
|
+
df,
|
|
1933
|
+
exclude_features_sources=exclude_features_sources,
|
|
1934
|
+
silent_mode=True,
|
|
1935
|
+
metrics_calculation=True,
|
|
1936
|
+
progress_bar=progress_bar,
|
|
1937
|
+
progress_callback=progress_callback,
|
|
1938
|
+
add_fit_system_record_id=True,
|
|
1939
|
+
target_name=tmp_target_name,
|
|
1940
|
+
)
|
|
1941
|
+
if enriched_df is None:
|
|
1942
|
+
return None
|
|
1917
1943
|
|
|
1918
|
-
|
|
1919
|
-
return self.__cache_and_return_results(
|
|
1920
|
-
validated_X, validated_y, eval_set, X_sampled, y_sampled, enriched_X, eval_set_sampled_dict
|
|
1921
|
-
)
|
|
1944
|
+
enriched_df = enriched_df.rename(columns={tmp_target_name: TARGET})
|
|
1922
1945
|
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
1926
|
-
|
|
1927
|
-
|
|
1928
|
-
if eval_set is None:
|
|
1929
|
-
return df
|
|
1946
|
+
x_columns = [
|
|
1947
|
+
c
|
|
1948
|
+
for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
|
|
1949
|
+
if c in enriched_df.columns
|
|
1950
|
+
]
|
|
1930
1951
|
|
|
1931
|
-
|
|
1952
|
+
enriched_Xy = enriched_df.query(f"{EVAL_SET_INDEX} == 0")
|
|
1953
|
+
X_sampled = enriched_Xy[x_columns].copy()
|
|
1954
|
+
y_sampled = enriched_Xy[TARGET].copy()
|
|
1955
|
+
enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1956
|
+
enriched_X_columns = enriched_X.columns.tolist()
|
|
1932
1957
|
|
|
1933
|
-
|
|
1934
|
-
|
|
1935
|
-
|
|
1936
|
-
|
|
1937
|
-
|
|
1938
|
-
|
|
1958
|
+
for idx in range(len(eval_set)):
|
|
1959
|
+
enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
|
1960
|
+
eval_x_sampled = enriched_eval_xy[x_columns].copy()
|
|
1961
|
+
eval_y_sampled = enriched_eval_xy[TARGET].copy()
|
|
1962
|
+
enriched_eval_x = enriched_eval_xy[enriched_X_columns].copy()
|
|
1963
|
+
eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
|
|
1964
|
+
else:
|
|
1965
|
+
self.logger.info("Transform without eval_set")
|
|
1966
|
+
df = validated_X.copy()
|
|
1939
1967
|
|
|
1940
|
-
|
|
1968
|
+
df[TARGET] = validated_y
|
|
1941
1969
|
|
|
1942
|
-
|
|
1943
|
-
num_samples = _num_samples(df)
|
|
1944
|
-
force_downsampling = (
|
|
1945
|
-
not self.disable_force_downsampling
|
|
1946
|
-
and self.columns_for_online_api is not None
|
|
1947
|
-
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1948
|
-
)
|
|
1970
|
+
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1949
1971
|
|
|
1950
|
-
|
|
1951
|
-
|
|
1952
|
-
|
|
1953
|
-
|
|
1954
|
-
|
|
1955
|
-
id_columns=self.id_columns,
|
|
1956
|
-
date_column=self._get_date_column(self.search_keys),
|
|
1957
|
-
task_type=self.model_task_type,
|
|
1958
|
-
cv_type=self.cv,
|
|
1959
|
-
random_state=self.random_state,
|
|
1960
|
-
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1961
|
-
logger=self.logger,
|
|
1962
|
-
bundle=self.bundle,
|
|
1963
|
-
warning_callback=self.__log_warning,
|
|
1972
|
+
num_samples = _num_samples(df)
|
|
1973
|
+
force_downsampling = (
|
|
1974
|
+
not self.disable_force_downsampling
|
|
1975
|
+
and self.columns_for_online_api is not None
|
|
1976
|
+
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1964
1977
|
)
|
|
1965
|
-
elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
1966
|
-
if EVAL_SET_INDEX in df.columns:
|
|
1967
|
-
threshold = Dataset.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
|
|
1968
|
-
sample_size = Dataset.FIT_SAMPLE_WITH_EVAL_SET_ROWS
|
|
1969
|
-
else:
|
|
1970
|
-
threshold = Dataset.FIT_SAMPLE_THRESHOLD
|
|
1971
|
-
sample_size = Dataset.FIT_SAMPLE_ROWS
|
|
1972
1978
|
|
|
1973
|
-
if
|
|
1974
|
-
self.logger.info(f"
|
|
1975
|
-
|
|
1979
|
+
if force_downsampling:
|
|
1980
|
+
self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
|
|
1981
|
+
df = balance_undersample_forced(
|
|
1982
|
+
df=df,
|
|
1983
|
+
target_column=TARGET,
|
|
1984
|
+
id_columns=self.id_columns,
|
|
1985
|
+
date_column=self._get_date_column(self.search_keys),
|
|
1986
|
+
task_type=self.model_task_type,
|
|
1987
|
+
cv_type=self.cv,
|
|
1988
|
+
random_state=self.random_state,
|
|
1989
|
+
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1990
|
+
logger=self.logger,
|
|
1991
|
+
bundle=self.bundle,
|
|
1992
|
+
warning_callback=self.__log_warning,
|
|
1993
|
+
)
|
|
1994
|
+
elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
1995
|
+
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
|
|
1996
|
+
df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
|
|
1976
1997
|
|
|
1977
|
-
|
|
1998
|
+
tmp_target_name = "__target"
|
|
1999
|
+
df = df.rename(columns={TARGET: tmp_target_name})
|
|
1978
2000
|
|
|
1979
|
-
|
|
1980
|
-
|
|
1981
|
-
|
|
1982
|
-
|
|
1983
|
-
|
|
1984
|
-
|
|
1985
|
-
|
|
1986
|
-
|
|
1987
|
-
|
|
1988
|
-
|
|
1989
|
-
|
|
2001
|
+
enriched_Xy, columns_renaming, generated_features = self.__inner_transform(
|
|
2002
|
+
trace_id,
|
|
2003
|
+
df,
|
|
2004
|
+
exclude_features_sources=exclude_features_sources,
|
|
2005
|
+
silent_mode=True,
|
|
2006
|
+
metrics_calculation=True,
|
|
2007
|
+
progress_bar=progress_bar,
|
|
2008
|
+
progress_callback=progress_callback,
|
|
2009
|
+
add_fit_system_record_id=True,
|
|
2010
|
+
target_name=tmp_target_name,
|
|
2011
|
+
)
|
|
2012
|
+
if enriched_Xy is None:
|
|
2013
|
+
return None
|
|
1990
2014
|
|
|
1991
|
-
|
|
1992
|
-
self, enriched_df: pd.DataFrame, x_columns: List[str], enriched_X_columns: List[str], eval_set_len: int
|
|
1993
|
-
) -> Dict[int, Tuple]:
|
|
1994
|
-
eval_set_sampled_dict = {}
|
|
2015
|
+
enriched_Xy = enriched_Xy.rename(columns={tmp_target_name: TARGET})
|
|
1995
2016
|
|
|
1996
|
-
|
|
1997
|
-
|
|
1998
|
-
|
|
1999
|
-
|
|
2000
|
-
|
|
2001
|
-
eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
|
|
2017
|
+
x_columns = [
|
|
2018
|
+
c
|
|
2019
|
+
for c in (validated_X.columns.tolist() + generated_features + [SYSTEM_RECORD_ID])
|
|
2020
|
+
if c in enriched_Xy.columns
|
|
2021
|
+
]
|
|
2002
2022
|
|
|
2003
|
-
|
|
2023
|
+
X_sampled = enriched_Xy[x_columns].copy()
|
|
2024
|
+
y_sampled = enriched_Xy[TARGET].copy()
|
|
2025
|
+
enriched_X = enriched_Xy.drop(columns=TARGET)
|
|
2004
2026
|
|
|
2005
|
-
def __cache_and_return_results(
|
|
2006
|
-
self,
|
|
2007
|
-
validated_X: pd.DataFrame,
|
|
2008
|
-
validated_y: pd.Series,
|
|
2009
|
-
eval_set: Optional[List[tuple]],
|
|
2010
|
-
X_sampled: pd.DataFrame,
|
|
2011
|
-
y_sampled: pd.Series,
|
|
2012
|
-
enriched_X: pd.DataFrame,
|
|
2013
|
-
eval_set_sampled_dict: Dict[int, Tuple],
|
|
2014
|
-
) -> _SampledDataForMetrics:
|
|
2015
2027
|
datasets_hash = hash_input(validated_X, validated_y, eval_set)
|
|
2016
|
-
columns_renaming = getattr(self, "fit_columns_renaming", {})
|
|
2017
|
-
|
|
2018
2028
|
self.__cached_sampled_datasets[datasets_hash] = (
|
|
2019
2029
|
X_sampled,
|
|
2020
2030
|
y_sampled,
|
|
@@ -2151,7 +2161,6 @@ if response.status_code == 200:
|
|
|
2151
2161
|
trace_id: str,
|
|
2152
2162
|
X: pd.DataFrame,
|
|
2153
2163
|
*,
|
|
2154
|
-
y: Optional[pd.Series] = None,
|
|
2155
2164
|
exclude_features_sources: Optional[List[str]] = None,
|
|
2156
2165
|
importance_threshold: Optional[float] = None,
|
|
2157
2166
|
max_features: Optional[int] = None,
|
|
@@ -2170,14 +2179,8 @@ if response.status_code == 200:
|
|
|
2170
2179
|
self.logger.info("Start transform")
|
|
2171
2180
|
|
|
2172
2181
|
validated_X = self._validate_X(X, is_transform=True)
|
|
2173
|
-
if y is not None:
|
|
2174
|
-
validated_y = self._validate_y(validated_X, y)
|
|
2175
|
-
df = self.__combine_train_and_eval_sets(validated_X, validated_y, eval_set=None)
|
|
2176
|
-
else:
|
|
2177
|
-
validated_y = None
|
|
2178
|
-
df = validated_X
|
|
2179
2182
|
|
|
2180
|
-
self.__log_debug_information(validated_X,
|
|
2183
|
+
self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
|
|
2181
2184
|
|
|
2182
2185
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
2183
2186
|
|
|
@@ -2220,27 +2223,29 @@ if response.status_code == 200:
|
|
|
2220
2223
|
self.logger.info(msg)
|
|
2221
2224
|
print(msg)
|
|
2222
2225
|
|
|
2223
|
-
is_demo_dataset = hash_input(
|
|
2226
|
+
is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
|
|
2224
2227
|
|
|
2225
2228
|
columns_to_drop = [
|
|
2226
|
-
c for c in
|
|
2229
|
+
c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
|
|
2227
2230
|
]
|
|
2228
2231
|
if len(columns_to_drop) > 0:
|
|
2229
2232
|
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
2230
2233
|
self.logger.warning(msg)
|
|
2231
2234
|
print(msg)
|
|
2232
|
-
|
|
2235
|
+
validated_X = validated_X.drop(columns=columns_to_drop)
|
|
2233
2236
|
|
|
2234
2237
|
search_keys = self.search_keys.copy()
|
|
2235
2238
|
if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
|
|
2236
|
-
search_keys.update(
|
|
2239
|
+
self.search_keys.update(
|
|
2237
2240
|
{col: SearchKey.CUSTOM_KEY for col in self.id_columns if col not in self.search_keys}
|
|
2238
2241
|
)
|
|
2239
2242
|
|
|
2240
2243
|
search_keys = self.__prepare_search_keys(
|
|
2241
|
-
|
|
2244
|
+
validated_X, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
|
|
2242
2245
|
)
|
|
2243
2246
|
|
|
2247
|
+
df = validated_X.copy()
|
|
2248
|
+
|
|
2244
2249
|
df = self.__handle_index_search_keys(df, search_keys)
|
|
2245
2250
|
|
|
2246
2251
|
if DEFAULT_INDEX in df.columns:
|
|
@@ -2279,11 +2284,8 @@ if response.status_code == 200:
|
|
|
2279
2284
|
features_for_transform = self._search_task.get_features_for_transform() or []
|
|
2280
2285
|
if len(features_for_transform) > 0:
|
|
2281
2286
|
missing_features_for_transform = [
|
|
2282
|
-
columns_renaming.get(f)
|
|
2287
|
+
columns_renaming.get(f) for f in features_for_transform if f not in df.columns
|
|
2283
2288
|
]
|
|
2284
|
-
if TARGET in missing_features_for_transform:
|
|
2285
|
-
raise ValidationError(self.bundle.get("missing_target_for_transform"))
|
|
2286
|
-
|
|
2287
2289
|
if len(missing_features_for_transform) > 0:
|
|
2288
2290
|
raise ValidationError(
|
|
2289
2291
|
self.bundle.get("missing_features_for_transform").format(missing_features_for_transform)
|
|
@@ -2339,10 +2341,11 @@ if response.status_code == 200:
|
|
|
2339
2341
|
converter = PostalCodeSearchKeyConverter(postal_code)
|
|
2340
2342
|
df = converter.convert(df)
|
|
2341
2343
|
|
|
2342
|
-
|
|
2343
|
-
meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
|
|
2344
|
-
meaning_types.update({col: key.value for col, key in search_keys.items()})
|
|
2344
|
+
# generated_features = [f for f in generated_features if f in self.fit_generated_features]
|
|
2345
2345
|
|
|
2346
|
+
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
2347
|
+
for col in features_for_transform:
|
|
2348
|
+
meaning_types[col] = FileColumnMeaningType.FEATURE
|
|
2346
2349
|
features_not_to_pass = [
|
|
2347
2350
|
c
|
|
2348
2351
|
for c in df.columns
|
|
@@ -2351,12 +2354,13 @@ if response.status_code == 200:
|
|
|
2351
2354
|
and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
|
2352
2355
|
]
|
|
2353
2356
|
|
|
2354
|
-
if add_fit_system_record_id:
|
|
2357
|
+
if add_fit_system_record_id and target_name is not None:
|
|
2358
|
+
reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
|
|
2355
2359
|
df = self.__add_fit_system_record_id(
|
|
2356
2360
|
df,
|
|
2357
2361
|
search_keys,
|
|
2358
2362
|
SYSTEM_RECORD_ID,
|
|
2359
|
-
|
|
2363
|
+
reversed_columns_renaming.get(target_name, target_name),
|
|
2360
2364
|
columns_renaming,
|
|
2361
2365
|
silent=True,
|
|
2362
2366
|
)
|
|
@@ -4066,7 +4070,10 @@ if response.status_code == 200:
|
|
|
4066
4070
|
)
|
|
4067
4071
|
|
|
4068
4072
|
if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
|
|
4069
|
-
|
|
4073
|
+
if self.__is_registered:
|
|
4074
|
+
msg = self.bundle.get("only_custom_keys")
|
|
4075
|
+
else:
|
|
4076
|
+
msg = self.bundle.get("unregistered_only_personal_keys")
|
|
4070
4077
|
self.logger.warning(msg + f" Provided search keys: {search_keys}")
|
|
4071
4078
|
raise ValidationError(msg)
|
|
4072
4079
|
|
upgini/metrics.py
CHANGED
|
@@ -11,7 +11,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
|
11
11
|
import catboost
|
|
12
12
|
import numpy as np
|
|
13
13
|
import pandas as pd
|
|
14
|
-
from
|
|
14
|
+
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
15
15
|
from numpy import log1p
|
|
16
16
|
from pandas.api.types import is_numeric_dtype
|
|
17
17
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
|
@@ -88,11 +88,18 @@ CATBOOST_MULTICLASS_PARAMS = {
|
|
|
88
88
|
|
|
89
89
|
LIGHTGBM_PARAMS = {
|
|
90
90
|
"random_state": DEFAULT_RANDOM_STATE,
|
|
91
|
-
"num_leaves": 16,
|
|
91
|
+
# "num_leaves": 16,
|
|
92
|
+
# "n_estimators": 150,
|
|
93
|
+
# "min_child_weight": 1,
|
|
92
94
|
"max_depth": 4,
|
|
93
|
-
"
|
|
95
|
+
"max_cat_threshold": 80,
|
|
96
|
+
"min_data_per_group": 25,
|
|
97
|
+
"num_boost_round": 150,
|
|
98
|
+
"cat_l2": 10,
|
|
99
|
+
"cat_smooth": 12,
|
|
94
100
|
"learning_rate": 0.05,
|
|
95
|
-
"
|
|
101
|
+
"feature_fraction": 1.0,
|
|
102
|
+
"min_sum_hessian_in_leaf": 0.01,
|
|
96
103
|
}
|
|
97
104
|
|
|
98
105
|
N_FOLDS = 5
|
|
@@ -211,6 +218,14 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
211
218
|
}
|
|
212
219
|
|
|
213
220
|
|
|
221
|
+
def is_catboost_estimator(estimator):
|
|
222
|
+
try:
|
|
223
|
+
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
224
|
+
return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
|
|
225
|
+
except ImportError:
|
|
226
|
+
return False
|
|
227
|
+
|
|
228
|
+
|
|
214
229
|
@dataclass
|
|
215
230
|
class _CrossValResults:
|
|
216
231
|
metric: Optional[float]
|
|
@@ -351,7 +366,7 @@ class EstimatorWrapper:
|
|
|
351
366
|
if shaps is not None:
|
|
352
367
|
for feature, shap_value in shaps.items():
|
|
353
368
|
# shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
|
|
354
|
-
shap_values_all_folds[feature].
|
|
369
|
+
shap_values_all_folds[feature].append(shap_value)
|
|
355
370
|
|
|
356
371
|
if shap_values_all_folds:
|
|
357
372
|
average_shap_values = {
|
|
@@ -431,17 +446,26 @@ class EstimatorWrapper:
|
|
|
431
446
|
# if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
|
|
432
447
|
# params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
|
|
433
448
|
if target_type == ModelTaskType.MULTICLASS:
|
|
434
|
-
params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
|
|
449
|
+
# params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
|
|
450
|
+
# params = _get_add_params(params, add_params)
|
|
451
|
+
# estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
|
|
452
|
+
params = _get_add_params(params, LIGHTGBM_PARAMS)
|
|
435
453
|
params = _get_add_params(params, add_params)
|
|
436
|
-
estimator =
|
|
454
|
+
estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
|
|
437
455
|
elif target_type == ModelTaskType.BINARY:
|
|
438
|
-
params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
|
|
456
|
+
# params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
|
|
457
|
+
# params = _get_add_params(params, add_params)
|
|
458
|
+
# estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
|
|
459
|
+
params = _get_add_params(params, LIGHTGBM_PARAMS)
|
|
439
460
|
params = _get_add_params(params, add_params)
|
|
440
|
-
estimator =
|
|
461
|
+
estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
|
|
441
462
|
elif target_type == ModelTaskType.REGRESSION:
|
|
442
|
-
params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
|
|
463
|
+
# params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
|
|
464
|
+
# params = _get_add_params(params, add_params)
|
|
465
|
+
# estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
|
|
466
|
+
params = _get_add_params(params, LIGHTGBM_PARAMS)
|
|
443
467
|
params = _get_add_params(params, add_params)
|
|
444
|
-
estimator =
|
|
468
|
+
estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
|
|
445
469
|
else:
|
|
446
470
|
raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
|
|
447
471
|
else:
|
|
@@ -450,7 +474,7 @@ class EstimatorWrapper:
|
|
|
450
474
|
else:
|
|
451
475
|
estimator_copy = deepcopy(estimator)
|
|
452
476
|
kwargs["estimator"] = estimator_copy
|
|
453
|
-
if
|
|
477
|
+
if is_catboost_estimator(estimator):
|
|
454
478
|
if cat_features is not None:
|
|
455
479
|
for cat_feature in cat_features:
|
|
456
480
|
if cat_feature not in x.columns:
|
|
@@ -458,23 +482,13 @@ class EstimatorWrapper:
|
|
|
458
482
|
f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
|
|
459
483
|
)
|
|
460
484
|
estimator_copy.set_params(
|
|
461
|
-
# cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
462
485
|
cat_features=cat_features
|
|
463
486
|
)
|
|
464
487
|
estimator = CatBoostWrapper(**kwargs)
|
|
465
488
|
else:
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
|
|
470
|
-
estimator = LightGBMWrapper(**kwargs)
|
|
471
|
-
else:
|
|
472
|
-
logger.warning(
|
|
473
|
-
f"Unexpected estimator is used for metrics: {estimator}. "
|
|
474
|
-
"Default strategy for category features will be used"
|
|
475
|
-
)
|
|
476
|
-
estimator = OtherEstimatorWrapper(**kwargs)
|
|
477
|
-
except ModuleNotFoundError:
|
|
489
|
+
if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
|
|
490
|
+
estimator = LightGBMWrapper(**kwargs)
|
|
491
|
+
else:
|
|
478
492
|
logger.warning(
|
|
479
493
|
f"Unexpected estimator is used for metrics: {estimator}. "
|
|
480
494
|
"Default strategy for category features will be used"
|
|
@@ -487,7 +501,7 @@ class EstimatorWrapper:
|
|
|
487
501
|
class CatBoostWrapper(EstimatorWrapper):
|
|
488
502
|
def __init__(
|
|
489
503
|
self,
|
|
490
|
-
estimator
|
|
504
|
+
estimator,
|
|
491
505
|
scorer: Callable,
|
|
492
506
|
metric_name: str,
|
|
493
507
|
multiplier: int,
|
|
@@ -517,6 +531,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
517
531
|
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
518
532
|
|
|
519
533
|
# Find embeddings
|
|
534
|
+
from catboost import CatBoostClassifier
|
|
520
535
|
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
521
536
|
emb_pattern = r"(.+)_emb\d+"
|
|
522
537
|
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
|
@@ -637,8 +652,9 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
637
652
|
else:
|
|
638
653
|
raise e
|
|
639
654
|
|
|
640
|
-
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator
|
|
655
|
+
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
|
641
656
|
try:
|
|
657
|
+
from catboost import Pool
|
|
642
658
|
# Create Pool for fold data, if need (for example, when categorical features are present)
|
|
643
659
|
fold_pool = Pool(
|
|
644
660
|
x,
|
|
@@ -715,6 +731,34 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
715
731
|
y = correct_string_target(y)
|
|
716
732
|
return x, y, params
|
|
717
733
|
|
|
734
|
+
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
|
735
|
+
try:
|
|
736
|
+
import shap
|
|
737
|
+
import lightgbm as lgb
|
|
738
|
+
|
|
739
|
+
if not isinstance(estimator, (lgb.LGBMRegressor, lgb.LGBMClassifier)):
|
|
740
|
+
return None
|
|
741
|
+
|
|
742
|
+
explainer = shap.TreeExplainer(estimator)
|
|
743
|
+
|
|
744
|
+
shap_values = explainer.shap_values(x)
|
|
745
|
+
|
|
746
|
+
# For classification, shap_values is returned as a list for each class
|
|
747
|
+
# Take values for the positive class
|
|
748
|
+
if isinstance(shap_values, list):
|
|
749
|
+
shap_values = shap_values[1]
|
|
750
|
+
|
|
751
|
+
# Calculate mean absolute SHAP value for each feature
|
|
752
|
+
feature_importance = {}
|
|
753
|
+
for i, col in enumerate(x.columns):
|
|
754
|
+
feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
|
|
755
|
+
|
|
756
|
+
return feature_importance
|
|
757
|
+
|
|
758
|
+
except Exception as e:
|
|
759
|
+
self.logger.warning(f"Failed to calculate SHAP values: {str(e)}")
|
|
760
|
+
return None
|
|
761
|
+
|
|
718
762
|
|
|
719
763
|
class OtherEstimatorWrapper(EstimatorWrapper):
|
|
720
764
|
def __init__(
|
|
@@ -80,6 +80,7 @@ email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneous
|
|
|
80
80
|
postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
|
|
81
81
|
multiple_search_key=Search key {} passed multiple times
|
|
82
82
|
unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
|
|
83
|
+
only_custom_keys=Only CUSTOM_KEY search keys were provided. At least one of DATE, COUNTRY, POSTAL_CODE, PHONE, EMAIL, HEM, IP should be provided
|
|
83
84
|
search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
|
|
84
85
|
numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
|
|
85
86
|
unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
@@ -136,7 +137,6 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
|
|
|
136
137
|
baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
|
|
137
138
|
baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
|
|
138
139
|
missing_features_for_transform=Missing some features for transform that were presented on fit: {}
|
|
139
|
-
missing_target_for_transform=Search contains features on target. Please add y to the call and try again
|
|
140
140
|
missing_id_column=Id column {} not found in X
|
|
141
141
|
# target validation
|
|
142
142
|
empty_target=Target is empty in all rows
|
upgini/search_task.py
CHANGED
|
@@ -168,13 +168,7 @@ class SearchTask:
|
|
|
168
168
|
for meta in self.provider_metadata_v2:
|
|
169
169
|
if meta.features_used_for_embeddings is not None:
|
|
170
170
|
features_for_transform.update(meta.features_used_for_embeddings)
|
|
171
|
-
|
|
172
|
-
features_for_transform.update(
|
|
173
|
-
c.original_name
|
|
174
|
-
for f in meta.generated_features
|
|
175
|
-
for c in f.base_columns
|
|
176
|
-
if c.ads_definition_id is None
|
|
177
|
-
)
|
|
171
|
+
|
|
178
172
|
return list(features_for_transform)
|
|
179
173
|
|
|
180
174
|
def get_shuffle_kfold(self) -> Optional[bool]:
|
upgini/utils/mstats.py
CHANGED
|
@@ -118,7 +118,7 @@ def spearmanr(
|
|
|
118
118
|
# - dof: degrees of freedom
|
|
119
119
|
# - t_stat: t-statistic
|
|
120
120
|
# - alternative: 'two-sided', 'greater', 'less'
|
|
121
|
-
def compute_t_pvalue(t_stat, dof, alternative=
|
|
121
|
+
def compute_t_pvalue(t_stat, dof, alternative='two-sided'):
|
|
122
122
|
from scipy.stats import t
|
|
123
123
|
|
|
124
124
|
if alternative == "two-sided":
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.68a3832.dev1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -23,11 +23,11 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
23
23
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
24
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
25
25
|
Requires-Python: <3.12,>=3.8
|
|
26
|
-
Requires-Dist: catboost>=1.0.3
|
|
27
26
|
Requires-Dist: fastparquet>=0.8.1
|
|
28
27
|
Requires-Dist: ipywidgets>=8.1.0
|
|
29
28
|
Requires-Dist: jarowinkler>=2.0.0
|
|
30
29
|
Requires-Dist: levenshtein>=0.25.1
|
|
30
|
+
Requires-Dist: lightgbm>=4.6.0
|
|
31
31
|
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
32
32
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
33
33
|
Requires-Dist: psutil>=6.0.0
|
|
@@ -39,6 +39,7 @@ Requires-Dist: python-json-logger>=3.3.0
|
|
|
39
39
|
Requires-Dist: requests>=2.8.0
|
|
40
40
|
Requires-Dist: scikit-learn>=1.3.0
|
|
41
41
|
Requires-Dist: scipy>=1.10.0
|
|
42
|
+
Requires-Dist: shap>=0.44.0
|
|
42
43
|
Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
|
|
43
44
|
Description-Content-Type: text/markdown
|
|
44
45
|
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=Rima-NAKm_MGtHsSB_1p736zBApReFwNHjULA-cfSyg,33
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=GXXx14jwf3F26_KrfJ6O40Vcu1hRx5iBjUB_jxy3Xvg,205476
|
|
7
7
|
upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
11
|
-
upgini/search_task.py,sha256=
|
|
10
|
+
upgini/metrics.py,sha256=OW2a3UWdMEkhRv7XDJvgBsc3iU6RLC5mtqvT1fLURwk,36983
|
|
11
|
+
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
@@ -16,19 +16,18 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
|
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
|
|
18
18
|
upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
19
|
+
upgini/autofe/date.py,sha256=I07psJerrxOcHao91PdSCk9X6KWu61IBVyFRLjGNgK8,10730
|
|
20
|
+
upgini/autofe/feature.py,sha256=xgu6bVIlUJ5PCUgoXQRNcGkcMOhj-_BdDRmkB_qRFS4,14766
|
|
21
21
|
upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
|
|
22
22
|
upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
|
|
23
23
|
upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
|
|
24
|
-
upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
|
|
25
24
|
upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
|
|
26
25
|
upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
|
|
27
|
-
upgini/autofe/timeseries/base.py,sha256
|
|
28
|
-
upgini/autofe/timeseries/cross.py,sha256=
|
|
26
|
+
upgini/autofe/timeseries/base.py,sha256=T9Ec8LKJbiwTUGGsd_xhM0U0NUJblqmKchkzUI1sK88,3755
|
|
27
|
+
upgini/autofe/timeseries/cross.py,sha256=Sh5hAXZFWKaFRqf_JGODu9pWO2tmuV5VKyK9eX3i7-I,4931
|
|
29
28
|
upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
|
|
30
29
|
upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
|
|
31
|
-
upgini/autofe/timeseries/roll.py,sha256=
|
|
30
|
+
upgini/autofe/timeseries/roll.py,sha256=bNFMDszSYTWvB7EyhHbRY1DJqzSURvHlPAcBebt0y0Y,2878
|
|
32
31
|
upgini/autofe/timeseries/trend.py,sha256=9p2Q5ByAi6cx9RH9teBTe8FyjSzqthznC2Lo5dsJ0ho,2051
|
|
33
32
|
upgini/autofe/timeseries/volatility.py,sha256=9shUmIKjpWTHVYjj80YBsk0XheBJ9uBuLv5NW9Mchnk,7953
|
|
34
33
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -39,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
39
38
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
40
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
41
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
42
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=XU5ulr5ZDQfGbFk9QdFDzl3oDMaw0eDYCPoEq3ZvIkw,27687
|
|
43
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
44
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
45
44
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -61,7 +60,7 @@ upgini/utils/feature_info.py,sha256=m1tQcT3hTChPAiXzpk0WQcEqElj8KgeCifEJFa7-gss,
|
|
|
61
60
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
62
61
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
63
62
|
upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
|
|
64
|
-
upgini/utils/mstats.py,sha256=
|
|
63
|
+
upgini/utils/mstats.py,sha256=dLJQr5Ak5BAoV-pDPpnfvMURZVkZ3_v250QzAsSlqY4,6286
|
|
65
64
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
66
65
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
67
66
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
@@ -71,7 +70,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
|
|
|
71
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
72
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
73
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
74
|
-
upgini-1.2.
|
|
75
|
-
upgini-1.2.
|
|
76
|
-
upgini-1.2.
|
|
77
|
-
upgini-1.2.
|
|
73
|
+
upgini-1.2.68a3832.dev1.dist-info/METADATA,sha256=8eYvZ97d0FtJE5Vj6-AQOm6nNxCHrq2dMggJoF0ft7g,49151
|
|
74
|
+
upgini-1.2.68a3832.dev1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
75
|
+
upgini-1.2.68a3832.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
76
|
+
upgini-1.2.68a3832.dev1.dist-info/RECORD,,
|
upgini/autofe/utils.py
DELETED
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Utility functions for autofe module.
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
import functools
|
|
6
|
-
from typing import Callable
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def get_pydantic_version():
|
|
10
|
-
"""
|
|
11
|
-
Get the major version of pydantic.
|
|
12
|
-
|
|
13
|
-
Returns:
|
|
14
|
-
int: Major version number (1 or 2)
|
|
15
|
-
"""
|
|
16
|
-
try:
|
|
17
|
-
from pydantic import __version__ as pydantic_version
|
|
18
|
-
|
|
19
|
-
major_version = int(pydantic_version.split(".")[0])
|
|
20
|
-
return major_version
|
|
21
|
-
except (ImportError, ValueError):
|
|
22
|
-
# Default to version 1 if unable to determine
|
|
23
|
-
return 1
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def pydantic_validator(field_name: str, *fields, mode: str = "before", **kwargs):
|
|
27
|
-
"""
|
|
28
|
-
A decorator that applies the appropriate Pydantic validator based on the installed version.
|
|
29
|
-
|
|
30
|
-
This decorator handles the differences between Pydantic v1 and v2 validator syntax,
|
|
31
|
-
making it easier to write code that works with both versions.
|
|
32
|
-
|
|
33
|
-
Args:
|
|
34
|
-
field_name (str): The name of the field to validate
|
|
35
|
-
mode (str): The validation mode, either "before" or "after" (for Pydantic v2)
|
|
36
|
-
**kwargs: Additional arguments to pass to the validator
|
|
37
|
-
|
|
38
|
-
Returns:
|
|
39
|
-
Callable: A decorator that can be applied to validator methods
|
|
40
|
-
|
|
41
|
-
Example:
|
|
42
|
-
```python
|
|
43
|
-
class MyModel(BaseModel):
|
|
44
|
-
items: List[int]
|
|
45
|
-
|
|
46
|
-
@pydantic_validator("items")
|
|
47
|
-
def parse_items(cls, value):
|
|
48
|
-
if isinstance(value, str):
|
|
49
|
-
return [int(x) for x in value.split(",")]
|
|
50
|
-
return value
|
|
51
|
-
```
|
|
52
|
-
"""
|
|
53
|
-
pydantic_version = get_pydantic_version()
|
|
54
|
-
|
|
55
|
-
if pydantic_version >= 2:
|
|
56
|
-
# Use field_validator for Pydantic 2.x
|
|
57
|
-
from pydantic import field_validator
|
|
58
|
-
|
|
59
|
-
def decorator(func: Callable) -> Callable:
|
|
60
|
-
@field_validator(field_name, *fields, mode=mode, **kwargs)
|
|
61
|
-
@functools.wraps(func)
|
|
62
|
-
def wrapper(cls, value, **kw):
|
|
63
|
-
return func(cls, value)
|
|
64
|
-
|
|
65
|
-
return wrapper
|
|
66
|
-
|
|
67
|
-
return decorator
|
|
68
|
-
else:
|
|
69
|
-
# Use validator for Pydantic 1.x
|
|
70
|
-
from pydantic import validator
|
|
71
|
-
|
|
72
|
-
# Map mode to Pydantic v1 parameters
|
|
73
|
-
pre = True if mode == "before" else False
|
|
74
|
-
|
|
75
|
-
def decorator(func: Callable) -> Callable:
|
|
76
|
-
@validator(field_name, *fields, pre=pre, **kwargs)
|
|
77
|
-
@functools.wraps(func)
|
|
78
|
-
def wrapper(cls, value, **kw):
|
|
79
|
-
return func(cls, value)
|
|
80
|
-
|
|
81
|
-
return wrapper
|
|
82
|
-
|
|
83
|
-
return decorator
|
|
File without changes
|