upgini 1.2.60a3792.dev2__tar.gz → 1.2.62a3818.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/PKG-INFO +2 -1
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/pyproject.toml +1 -0
- upgini-1.2.62a3818.dev1/src/upgini/__about__.py +1 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/all_operands.py +2 -2
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/binary.py +1 -1
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/date.py +2 -2
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/feature.py +1 -1
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/groupby.py +1 -1
- upgini-1.2.60a3792.dev2/src/upgini/autofe/operand.py → upgini-1.2.62a3818.dev1/src/upgini/autofe/operator.py +2 -2
- upgini-1.2.60a3792.dev2/src/upgini/autofe/vector.py → upgini-1.2.62a3818.dev1/src/upgini/autofe/timeseries.py +3 -23
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/unary.py +1 -1
- upgini-1.2.62a3818.dev1/src/upgini/autofe/vector.py +24 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/data_source/data_source_publisher.py +9 -4
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/features_enricher.py +108 -46
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/metrics.py +4 -7
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/resource_bundle/strings.properties +1 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/datetime_utils.py +2 -0
- upgini-1.2.62a3818.dev1/src/upgini/utils/mstats.py +177 -0
- upgini-1.2.62a3818.dev1/src/upgini/utils/sort.py +172 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/target_utils.py +3 -3
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/ts_utils.py +0 -6
- upgini-1.2.60a3792.dev2/src/upgini/__about__.py +0 -1
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/.gitignore +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/LICENSE +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/README.md +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/__init__.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/ads.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/dataset.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/errors.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/http.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/metadata.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/search_task.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/spinner.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.60a3792.dev2
|
|
3
|
+
Version: 1.2.62a3818.dev1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -30,6 +30,7 @@ Requires-Dist: jarowinkler>=2.0.0
|
|
|
30
30
|
Requires-Dist: levenshtein>=0.25.1
|
|
31
31
|
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
32
32
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
33
|
+
Requires-Dist: psutil>=6.0.0
|
|
33
34
|
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
34
35
|
Requires-Dist: pyjwt>=2.8.0
|
|
35
36
|
Requires-Dist: python-bidi==0.4.2
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.62a3818.dev1"
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from upgini.autofe.operand import OperandRegistry
|
|
1
|
+
from upgini.autofe.operator import OperatorRegistry
|
|
2
2
|
from upgini.autofe.unary import * # noqa
|
|
3
3
|
from upgini.autofe.binary import * # noqa
|
|
4
4
|
from upgini.autofe.groupby import * # noqa
|
|
@@ -7,4 +7,4 @@ from upgini.autofe.vector import * # noqa
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
def find_op(name):
|
|
10
|
-
return OperandRegistry.get_operand(name)
|
|
10
|
+
return OperatorRegistry.get_operand(name)
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import abc
|
|
2
2
|
import json
|
|
3
|
-
from typing import
|
|
3
|
+
from typing import Dict, List, Optional, Union
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
7
7
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
8
8
|
from pydantic import BaseModel, __version__ as pydantic_version
|
|
9
9
|
|
|
10
|
-
from upgini.autofe.operand import PandasOperand, ParametrizedOperand
|
|
10
|
+
from upgini.autofe.operator import PandasOperand, ParametrizedOperand
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def get_pydantic_version():
|
|
@@ -6,7 +6,7 @@ import pandas as pd
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
class OperandRegistry(type(BaseModel)):
|
|
9
|
+
class OperatorRegistry(type(BaseModel)):
|
|
10
10
|
_registry = {}
|
|
11
11
|
_parametrized_registry = []
|
|
12
12
|
|
|
@@ -46,7 +46,7 @@ class OperandRegistry(type(BaseModel)):
|
|
|
46
46
|
return None
|
|
47
47
|
|
|
48
48
|
|
|
49
|
-
class Operand(BaseModel, metaclass=OperandRegistry):
|
|
49
|
+
class Operand(BaseModel, metaclass=OperatorRegistry):
|
|
50
50
|
name: Optional[str] = None
|
|
51
51
|
alias: Optional[str] = None
|
|
52
52
|
is_unary: bool = False
|
|
@@ -2,33 +2,13 @@ import abc
|
|
|
2
2
|
from typing import Dict, List, Optional
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
|
+
from upgini.autofe.operator import PandasOperand, ParametrizedOperand
|
|
5
6
|
|
|
6
7
|
try:
|
|
7
8
|
from pydantic import field_validator as validator # V2
|
|
8
9
|
except ImportError:
|
|
9
10
|
from pydantic import validator # V1
|
|
10
11
|
|
|
11
|
-
from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
class Mean(PandasOperand, VectorizableMixin):
|
|
15
|
-
name: str = "mean"
|
|
16
|
-
output_type: Optional[str] = "float"
|
|
17
|
-
is_vector: bool = True
|
|
18
|
-
group_index: int = 0
|
|
19
|
-
|
|
20
|
-
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
21
|
-
return pd.DataFrame(data).T.fillna(0).mean(axis=1)
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class Sum(PandasOperand, VectorizableMixin):
|
|
25
|
-
name: str = "sum"
|
|
26
|
-
is_vector: bool = True
|
|
27
|
-
group_index: int = 0
|
|
28
|
-
|
|
29
|
-
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
30
|
-
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
|
31
|
-
|
|
32
12
|
|
|
33
13
|
class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
34
14
|
is_vector: bool = True
|
|
@@ -55,7 +35,7 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
|
55
35
|
ts.set_index(date.name, inplace=True)
|
|
56
36
|
ts = ts[ts.index.notna()].sort_index()
|
|
57
37
|
ts = (
|
|
58
|
-
ts.groupby([c.name for c in data[1:-1]])
|
|
38
|
+
ts.groupby([c.name for c in data[1:-1]], group_keys=True)
|
|
59
39
|
.apply(self._shift)[data[-1].name]
|
|
60
40
|
.to_frame()
|
|
61
41
|
.reset_index()
|
|
@@ -84,7 +64,7 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
|
84
64
|
pass
|
|
85
65
|
|
|
86
66
|
|
|
87
|
-
_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
|
|
67
|
+
_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean(), "last": lambda x: x[-1]}
|
|
88
68
|
|
|
89
69
|
|
|
90
70
|
class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
@@ -2,7 +2,7 @@ from typing import Dict, Optional
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
-
from upgini.autofe.
|
|
5
|
+
from upgini.autofe.operator import PandasOperand, VectorizableMixin
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class Abs(PandasOperand, VectorizableMixin):
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from typing import List, Optional
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from upgini.autofe.operator import PandasOperand, VectorizableMixin
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Mean(PandasOperand, VectorizableMixin):
|
|
9
|
+
name: str = "mean"
|
|
10
|
+
output_type: Optional[str] = "float"
|
|
11
|
+
is_vector: bool = True
|
|
12
|
+
group_index: int = 0
|
|
13
|
+
|
|
14
|
+
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
15
|
+
return pd.DataFrame(data).T.fillna(0).mean(axis=1)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class Sum(PandasOperand, VectorizableMixin):
|
|
19
|
+
name: str = "sum"
|
|
20
|
+
is_vector: bool = True
|
|
21
|
+
group_index: int = 0
|
|
22
|
+
|
|
23
|
+
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
24
|
+
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
{upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/data_source/data_source_publisher.py
RENAMED
|
@@ -63,6 +63,7 @@ class DataSourcePublisher:
|
|
|
63
63
|
keep_features: Optional[List[str]] = None,
|
|
64
64
|
date_features: Optional[List[str]] = None,
|
|
65
65
|
date_vector_features: Optional[List[str]] = None,
|
|
66
|
+
date_features_format: Optional[str] = None,
|
|
66
67
|
generate_runtime_embeddings: Optional[List[str]] = None,
|
|
67
68
|
exclude_raw: Optional[List[str]] = None,
|
|
68
69
|
_force_generation=False,
|
|
@@ -160,13 +161,17 @@ class DataSourcePublisher:
|
|
|
160
161
|
if keep_features is not None:
|
|
161
162
|
request["keepFeatures"] = keep_features
|
|
162
163
|
if date_features is not None:
|
|
163
|
-
if
|
|
164
|
-
raise ValidationError("
|
|
164
|
+
if date_features_format is None:
|
|
165
|
+
raise ValidationError("date_features_format should be presented if you use date features")
|
|
165
166
|
request["dateFeatures"] = date_features
|
|
167
|
+
request["dateFeaturesFormat"] = date_features_format
|
|
166
168
|
if date_vector_features is not None:
|
|
167
|
-
if
|
|
168
|
-
raise ValidationError(
|
|
169
|
+
if date_features_format is None:
|
|
170
|
+
raise ValidationError(
|
|
171
|
+
"date_features_format should be presented if you use date vector features"
|
|
172
|
+
)
|
|
169
173
|
request["dateVectorFeatures"] = date_vector_features
|
|
174
|
+
request["dateFeaturesFormat"] = date_features_format
|
|
170
175
|
if generate_runtime_embeddings is not None:
|
|
171
176
|
request["generateRuntimeEmbeddingsFeatures"] = generate_runtime_embeddings
|
|
172
177
|
if exclude_raw is not None:
|
|
@@ -31,7 +31,7 @@ from sklearn.exceptions import NotFittedError
|
|
|
31
31
|
from sklearn.model_selection import BaseCrossValidator
|
|
32
32
|
|
|
33
33
|
from upgini.autofe.feature import Feature
|
|
34
|
-
from upgini.autofe.vector import TimeSeriesBase
|
|
34
|
+
from upgini.autofe.timeseries import TimeSeriesBase
|
|
35
35
|
from upgini.data_source.data_source_publisher import CommercialSchema
|
|
36
36
|
from upgini.dataset import Dataset
|
|
37
37
|
from upgini.errors import HttpError, ValidationError
|
|
@@ -112,6 +112,7 @@ try:
|
|
|
112
112
|
except Exception:
|
|
113
113
|
from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
|
|
114
114
|
|
|
115
|
+
from upgini.utils.sort import sort_columns
|
|
115
116
|
from upgini.utils.target_utils import (
|
|
116
117
|
balance_undersample_forced,
|
|
117
118
|
calculate_psi,
|
|
@@ -1261,7 +1262,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1261
1262
|
for feature, shap in new_shaps.items()
|
|
1262
1263
|
if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
|
|
1263
1264
|
}
|
|
1264
|
-
self.__prepare_feature_importances(trace_id, x_columns, new_shaps
|
|
1265
|
+
self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
|
|
1265
1266
|
|
|
1266
1267
|
if self.features_info_display_handle is not None:
|
|
1267
1268
|
try:
|
|
@@ -1568,9 +1569,23 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1568
1569
|
|
|
1569
1570
|
fitting_eval_set_dict = {}
|
|
1570
1571
|
fitting_x_columns = fitting_X.columns.to_list()
|
|
1571
|
-
|
|
1572
|
+
# Idempotently sort columns
|
|
1573
|
+
fitting_x_columns = sort_columns(
|
|
1574
|
+
fitting_X, y_sorted, search_keys, self.model_task_type, sort_all_columns=True, logger=self.logger
|
|
1575
|
+
)
|
|
1576
|
+
fitting_X = fitting_X[fitting_x_columns]
|
|
1577
|
+
self.logger.info(f"Final sorted list of fitting X columns: {fitting_x_columns}")
|
|
1572
1578
|
fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
|
|
1573
|
-
|
|
1579
|
+
fitting_enriched_x_columns = sort_columns(
|
|
1580
|
+
fitting_enriched_X,
|
|
1581
|
+
enriched_y_sorted,
|
|
1582
|
+
search_keys,
|
|
1583
|
+
self.model_task_type,
|
|
1584
|
+
sort_all_columns=True,
|
|
1585
|
+
logger=self.logger,
|
|
1586
|
+
)
|
|
1587
|
+
fitting_enriched_X = fitting_enriched_X[fitting_enriched_x_columns]
|
|
1588
|
+
self.logger.info(f"Final sorted list of fitting enriched X columns: {fitting_enriched_x_columns}")
|
|
1574
1589
|
for idx, eval_tuple in eval_set_sampled_dict.items():
|
|
1575
1590
|
eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
|
|
1576
1591
|
eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
|
|
@@ -1734,11 +1749,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1734
1749
|
if eval_set is not None
|
|
1735
1750
|
else (Dataset.FIT_SAMPLE_THRESHOLD, Dataset.FIT_SAMPLE_ROWS)
|
|
1736
1751
|
)
|
|
1752
|
+
|
|
1753
|
+
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID, TARGET, columns_renaming, silent=True)
|
|
1754
|
+
# Sample after sorting by system_record_id for idempotency
|
|
1755
|
+
df.sort_values(by=SYSTEM_RECORD_ID, inplace=True)
|
|
1756
|
+
|
|
1737
1757
|
if num_samples > sample_threshold:
|
|
1738
1758
|
self.logger.info(f"Downsampling from {num_samples} to {sample_rows}")
|
|
1739
1759
|
df = df.sample(n=sample_rows, random_state=self.random_state)
|
|
1740
1760
|
|
|
1741
|
-
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
1742
1761
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
1743
1762
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
1744
1763
|
|
|
@@ -1882,6 +1901,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1882
1901
|
and self.columns_for_online_api is not None
|
|
1883
1902
|
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1884
1903
|
)
|
|
1904
|
+
# TODO: check that system_record_id was added before this step
|
|
1885
1905
|
if force_downsampling:
|
|
1886
1906
|
self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
|
|
1887
1907
|
df = balance_undersample_forced(
|
|
@@ -1915,6 +1935,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1915
1935
|
progress_bar=progress_bar,
|
|
1916
1936
|
progress_callback=progress_callback,
|
|
1917
1937
|
add_fit_system_record_id=True,
|
|
1938
|
+
target_name=tmp_target_name,
|
|
1918
1939
|
)
|
|
1919
1940
|
if enriched_df is None:
|
|
1920
1941
|
return None
|
|
@@ -1953,6 +1974,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1953
1974
|
and self.columns_for_online_api is not None
|
|
1954
1975
|
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1955
1976
|
)
|
|
1977
|
+
|
|
1956
1978
|
if force_downsampling:
|
|
1957
1979
|
self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
|
|
1958
1980
|
df = balance_undersample_forced(
|
|
@@ -1984,6 +2006,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1984
2006
|
progress_bar=progress_bar,
|
|
1985
2007
|
progress_callback=progress_callback,
|
|
1986
2008
|
add_fit_system_record_id=True,
|
|
2009
|
+
target_name=tmp_target_name,
|
|
1987
2010
|
)
|
|
1988
2011
|
if enriched_Xy is None:
|
|
1989
2012
|
return None
|
|
@@ -2145,6 +2168,7 @@ if response.status_code == 200:
|
|
|
2145
2168
|
progress_bar: Optional[ProgressBar] = None,
|
|
2146
2169
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
2147
2170
|
add_fit_system_record_id: bool = False,
|
|
2171
|
+
target_name: Optional[str] = None,
|
|
2148
2172
|
) -> Tuple[pd.DataFrame, Dict[str, str], List[str]]:
|
|
2149
2173
|
if self._search_task is None:
|
|
2150
2174
|
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
|
@@ -2329,8 +2353,16 @@ if response.status_code == 200:
|
|
|
2329
2353
|
and c not in [ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
|
2330
2354
|
]
|
|
2331
2355
|
|
|
2332
|
-
if add_fit_system_record_id:
|
|
2333
|
-
|
|
2356
|
+
if add_fit_system_record_id and target_name is not None:
|
|
2357
|
+
reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
|
|
2358
|
+
df = self.__add_fit_system_record_id(
|
|
2359
|
+
df,
|
|
2360
|
+
search_keys,
|
|
2361
|
+
SYSTEM_RECORD_ID,
|
|
2362
|
+
reversed_columns_renaming.get(target_name, target_name),
|
|
2363
|
+
columns_renaming,
|
|
2364
|
+
silent=True,
|
|
2365
|
+
)
|
|
2334
2366
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2335
2367
|
features_not_to_pass.append(SORT_ID)
|
|
2336
2368
|
|
|
@@ -2775,7 +2807,9 @@ if response.status_code == 200:
|
|
|
2775
2807
|
self.__log_warning(full_duplicates_warning)
|
|
2776
2808
|
|
|
2777
2809
|
# Explode multiple search keys
|
|
2778
|
-
df = self.__add_fit_system_record_id(
|
|
2810
|
+
df = self.__add_fit_system_record_id(
|
|
2811
|
+
df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming
|
|
2812
|
+
)
|
|
2779
2813
|
|
|
2780
2814
|
# TODO check that this is correct for enrichment
|
|
2781
2815
|
self.df_with_original_index = df.copy()
|
|
@@ -2857,7 +2891,9 @@ if response.status_code == 200:
|
|
|
2857
2891
|
if eval_set is not None and len(eval_set) > 0:
|
|
2858
2892
|
meaning_types[EVAL_SET_INDEX] = FileColumnMeaningType.EVAL_SET_INDEX
|
|
2859
2893
|
|
|
2860
|
-
df = self.__add_fit_system_record_id(
|
|
2894
|
+
df = self.__add_fit_system_record_id(
|
|
2895
|
+
df, self.fit_search_keys, SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming, silent=True
|
|
2896
|
+
)
|
|
2861
2897
|
|
|
2862
2898
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2863
2899
|
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
@@ -3544,56 +3580,82 @@ if response.status_code == 200:
|
|
|
3544
3580
|
def __add_fit_system_record_id(
|
|
3545
3581
|
self,
|
|
3546
3582
|
df: pd.DataFrame,
|
|
3547
|
-
# meaning_types: Dict[str, FileColumnMeaningType],
|
|
3548
3583
|
search_keys: Dict[str, SearchKey],
|
|
3549
3584
|
id_name: str,
|
|
3585
|
+
target_name: str,
|
|
3586
|
+
columns_renaming: Dict[str, str],
|
|
3587
|
+
silent: bool = False,
|
|
3550
3588
|
) -> pd.DataFrame:
|
|
3551
|
-
# save original order or rows
|
|
3552
3589
|
original_index_name = df.index.name
|
|
3553
3590
|
index_name = df.index.name or DEFAULT_INDEX
|
|
3554
3591
|
original_order_name = "original_order"
|
|
3592
|
+
# Save original index
|
|
3555
3593
|
df = df.reset_index().rename(columns={index_name: ORIGINAL_INDEX})
|
|
3594
|
+
# Save original order
|
|
3556
3595
|
df = df.reset_index().rename(columns={DEFAULT_INDEX: original_order_name})
|
|
3557
3596
|
|
|
3558
|
-
# order by date and idempotent order by other keys
|
|
3559
|
-
if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
3560
|
-
sort_exclude_columns = [
|
|
3561
|
-
original_order_name,
|
|
3562
|
-
ORIGINAL_INDEX,
|
|
3563
|
-
EVAL_SET_INDEX,
|
|
3564
|
-
TARGET,
|
|
3565
|
-
"__target",
|
|
3566
|
-
ENTITY_SYSTEM_RECORD_ID,
|
|
3567
|
-
]
|
|
3568
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3569
|
-
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3570
|
-
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
3571
|
-
else:
|
|
3572
|
-
date_column = self._get_date_column(search_keys)
|
|
3573
|
-
sort_columns = [date_column] if date_column is not None else []
|
|
3597
|
+
# order by date and idempotent order by other keys and features
|
|
3574
3598
|
|
|
3575
|
-
|
|
3576
|
-
|
|
3599
|
+
sort_exclude_columns = [
|
|
3600
|
+
original_order_name,
|
|
3601
|
+
ORIGINAL_INDEX,
|
|
3602
|
+
EVAL_SET_INDEX,
|
|
3603
|
+
TARGET,
|
|
3604
|
+
"__target",
|
|
3605
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
3606
|
+
]
|
|
3607
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
3608
|
+
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
3609
|
+
sort_exclude_columns.append(FeaturesEnricher._get_date_column(search_keys))
|
|
3610
|
+
else:
|
|
3611
|
+
date_column = FeaturesEnricher._get_date_column(search_keys)
|
|
3612
|
+
sort_exclude_columns.append(date_column)
|
|
3613
|
+
columns_to_sort = [date_column] if date_column is not None else []
|
|
3614
|
+
|
|
3615
|
+
do_sorting = True
|
|
3616
|
+
if self.id_columns and self.cv in [CVType.time_series, CVType.blocked_time_series]:
|
|
3617
|
+
# Check duplicates by date and id_columns
|
|
3618
|
+
reversed_columns_renaming = {v: k for k, v in columns_renaming.items()}
|
|
3619
|
+
renamed_id_columns = [reversed_columns_renaming.get(c, c) for c in self.id_columns]
|
|
3620
|
+
duplicate_check_columns = [c for c in renamed_id_columns if c in df.columns]
|
|
3621
|
+
if date_column is not None:
|
|
3622
|
+
duplicate_check_columns.append(date_column)
|
|
3577
3623
|
|
|
3578
|
-
|
|
3579
|
-
|
|
3580
|
-
|
|
3581
|
-
|
|
3582
|
-
|
|
3583
|
-
|
|
3584
|
-
|
|
3585
|
-
|
|
3586
|
-
|
|
3624
|
+
duplicates = df.duplicated(subset=duplicate_check_columns, keep=False)
|
|
3625
|
+
if duplicates.any():
|
|
3626
|
+
if not silent:
|
|
3627
|
+
self.__log_warning(self.bundle.get("date_and_id_columns_duplicates").format(duplicates.sum()))
|
|
3628
|
+
else:
|
|
3629
|
+
self.logger.warning(
|
|
3630
|
+
f"Found {duplicates.sum()} duplicate rows by date and ID columns: {duplicate_check_columns}."
|
|
3631
|
+
" Will not sort dataset"
|
|
3632
|
+
)
|
|
3633
|
+
do_sorting = False
|
|
3634
|
+
else:
|
|
3635
|
+
columns_to_hash = list(search_keys.keys()) + renamed_id_columns + [target_name]
|
|
3636
|
+
columns_to_hash = sort_columns(
|
|
3637
|
+
df[columns_to_hash],
|
|
3638
|
+
target_name,
|
|
3639
|
+
search_keys,
|
|
3640
|
+
self.model_task_type,
|
|
3641
|
+
sort_exclude_columns,
|
|
3642
|
+
logger=self.logger,
|
|
3643
|
+
)
|
|
3644
|
+
else:
|
|
3645
|
+
columns_to_hash = sort_columns(
|
|
3646
|
+
df, target_name, search_keys, self.model_task_type, sort_exclude_columns, logger=self.logger
|
|
3587
3647
|
)
|
|
3588
|
-
|
|
3589
|
-
all_other_columns = sorted_other_keys + other_columns
|
|
3590
|
-
|
|
3648
|
+
if do_sorting:
|
|
3591
3649
|
search_keys_hash = "search_keys_hash"
|
|
3592
|
-
if len(
|
|
3593
|
-
|
|
3594
|
-
|
|
3595
|
-
|
|
3596
|
-
|
|
3650
|
+
if len(columns_to_hash) > 0:
|
|
3651
|
+
factorized_df = df.copy()
|
|
3652
|
+
for col in columns_to_hash:
|
|
3653
|
+
if col not in search_keys and not is_numeric_dtype(factorized_df[col]):
|
|
3654
|
+
factorized_df[col] = factorized_df[col].factorize(sort=True)[0]
|
|
3655
|
+
df[search_keys_hash] = pd.util.hash_pandas_object(factorized_df[columns_to_hash], index=False)
|
|
3656
|
+
columns_to_sort.append(search_keys_hash)
|
|
3657
|
+
|
|
3658
|
+
df = df.sort_values(by=columns_to_sort)
|
|
3597
3659
|
|
|
3598
3660
|
if search_keys_hash in df.columns:
|
|
3599
3661
|
df.drop(columns=search_keys_hash, inplace=True)
|
|
@@ -30,8 +30,8 @@ except ImportError:
|
|
|
30
30
|
from sklearn.metrics._regression import (
|
|
31
31
|
_check_reg_targets,
|
|
32
32
|
check_consistent_length,
|
|
33
|
-
mean_squared_error,
|
|
34
33
|
)
|
|
34
|
+
from sklearn.metrics import mean_squared_error
|
|
35
35
|
from sklearn.model_selection import BaseCrossValidator
|
|
36
36
|
|
|
37
37
|
from upgini.errors import ValidationError
|
|
@@ -289,9 +289,6 @@ class EstimatorWrapper:
|
|
|
289
289
|
else:
|
|
290
290
|
x, y = self._remove_empty_target_rows(x, y)
|
|
291
291
|
|
|
292
|
-
# Make order of columns idempotent
|
|
293
|
-
x = x[sorted(x.columns)]
|
|
294
|
-
|
|
295
292
|
self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
|
|
296
293
|
return x, y, groups
|
|
297
294
|
|
|
@@ -569,7 +566,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
569
566
|
if all([isinstance(c, int) for c in estimator_cat_features]):
|
|
570
567
|
cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
|
|
571
568
|
cat_features_idx.update(estimator_cat_features)
|
|
572
|
-
self.cat_features = [x.columns[idx] for idx in
|
|
569
|
+
self.cat_features = [x.columns[idx] for idx in cat_features_idx]
|
|
573
570
|
elif all([isinstance(c, str) for c in estimator_cat_features]):
|
|
574
571
|
self.cat_features = list(set(self.cat_features + estimator_cat_features))
|
|
575
572
|
else:
|
|
@@ -940,13 +937,13 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
|
|
|
940
937
|
if (y_true < 0).any():
|
|
941
938
|
raise ValidationError(bundle.get("metrics_msle_negative_target"))
|
|
942
939
|
|
|
943
|
-
|
|
940
|
+
mse = mean_squared_error(
|
|
944
941
|
log1p(y_true),
|
|
945
942
|
log1p(y_pred.clip(0)),
|
|
946
943
|
sample_weight=sample_weight,
|
|
947
944
|
multioutput=multioutput,
|
|
948
|
-
squared=squared,
|
|
949
945
|
)
|
|
946
|
+
return mse if squared else np.sqrt(mse)
|
|
950
947
|
|
|
951
948
|
|
|
952
949
|
def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
|
{upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/resource_bundle/strings.properties
RENAMED
|
@@ -35,6 +35,7 @@ trial_quota_limit_riched=You have reached the quota limit of trial data usage. P
|
|
|
35
35
|
loss_selection_warn=Loss `{0}` is not supported for feature selection with {1}
|
|
36
36
|
loss_calc_metrics_warn=Loss `{0}` is not supported for metrics calculation with {1}
|
|
37
37
|
multivariate_timeseries_detected=Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
38
|
+
date_and_id_columns_duplicates=Found {} duplicate rows by date and id_columns
|
|
38
39
|
group_k_fold_in_classification=Using group K-fold cross-validation split for classification task.
|
|
39
40
|
current_date_added=No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
40
41
|
# Errors
|
|
@@ -166,6 +166,8 @@ class DateTimeSearchKeyConverter:
|
|
|
166
166
|
|
|
167
167
|
# Drop intermediate columns if not needed
|
|
168
168
|
df.drop(columns=["second", "minute", "hour"], inplace=True)
|
|
169
|
+
else:
|
|
170
|
+
keep_time = False
|
|
169
171
|
|
|
170
172
|
for generated_feature in self.generated_features[:]:
|
|
171
173
|
if df[generated_feature].dropna().nunique() <= 1:
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import warnings
|
|
2
|
+
from collections import namedtuple
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import numpy.ma as ma
|
|
6
|
+
import scipy
|
|
7
|
+
from joblib import Parallel, delayed
|
|
8
|
+
from numpy import ndarray
|
|
9
|
+
from psutil import cpu_count
|
|
10
|
+
|
|
11
|
+
np.seterr(divide="ignore")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
warnings.simplefilter(action="ignore", category=RuntimeWarning)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _find_repeats(arr):
|
|
18
|
+
# This function assumes it may clobber its input.
|
|
19
|
+
if len(arr) == 0:
|
|
20
|
+
return np.array(0, np.float64), np.array(0, np.intp)
|
|
21
|
+
|
|
22
|
+
# XXX This cast was previously needed for the Fortran implementation,
|
|
23
|
+
# should we ditch it?
|
|
24
|
+
arr = np.asarray(arr, np.float64).ravel()
|
|
25
|
+
arr.sort()
|
|
26
|
+
|
|
27
|
+
# Taken from NumPy 1.9's np.unique.
|
|
28
|
+
change = np.concatenate(([True], arr[1:] != arr[:-1]))
|
|
29
|
+
unique = arr[change]
|
|
30
|
+
change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
|
|
31
|
+
freq = np.diff(change_idx)
|
|
32
|
+
atleast2 = freq > 1
|
|
33
|
+
return unique[atleast2], freq[atleast2]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def find_repeats(arr):
    """Find the repeated values among the unmasked entries of *arr*.

    The unmasked data are extracted as ``float64`` and passed to
    ``_find_repeats``, which sorts its argument in place. A private copy is
    taken first whenever the extracted array might share memory with *arr*,
    because ``ma.compressed`` promises a "new array" but can actually return
    a reference.
    """
    values = np.asarray(ma.compressed(arr), dtype=np.float64)
    try:
        shares_memory = np.may_share_memory(values, arr)
    except AttributeError:
        # numpy < 1.8.2 bug: np.may_share_memory([], []) raises, while in
        # numpy 1.8.2 and above it just (correctly) returns False.
        shares_memory = False
    return _find_repeats(values.copy() if shares_memory else values)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def rankdata(data, axis=None, use_missing=False):
    """Rank the unmasked values of *data*, averaging the ranks of ties.

    Ranks start at 1. Masked entries receive rank 0, or ``(n + 1) / 2`` (with
    ``n`` the number of unmasked values) when *use_missing* is true.

    With ``axis=None`` the data are ranked as one flattened sequence and the
    result is reshaped to the input's shape. Otherwise every 1-D slice along
    *axis* is ranked independently and a plain ``ndarray`` is returned.
    """

    def _rank1d(data, use_missing=False):
        valid_count = data.count()
        ranks = np.empty(data.size, dtype=float)
        order = data.argsort()
        # The first `valid_count` positions of the sort order are treated as
        # the unmasked values; the remainder are the masked ones.
        ranks[order[:valid_count]] = np.arange(1, valid_count + 1)
        ranks[order[valid_count:]] = (valid_count + 1) / 2.0 if use_missing else 0

        # Replace the ranks of every tied group with the group's mean rank.
        tied_values, _ = find_repeats(data.copy())
        for value in tied_values:
            tied = (data == value).filled(False)
            ranks[tied] = ranks[tied].mean()
        return ranks

    data = ma.array(data, copy=False)
    if axis is not None:
        return ma.apply_along_axis(_rank1d, axis, data, use_missing).view(ndarray)
    if data.ndim > 1:
        return _rank1d(data.ravel(), use_missing).reshape(data.shape)
    return _rank1d(data, use_missing)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _chk_asarray(a, axis):
    """Coerce *a* to a masked array and normalise *axis*.

    Returns ``(array, outaxis)``. With ``axis=None`` the array is flattened
    and ``outaxis`` is 0; otherwise the array keeps its shape and ``outaxis``
    is *axis* unchanged.
    """
    masked = ma.asanyarray(a)
    if axis is not None:
        return masked, axis
    return ma.ravel(masked), 0
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# Result pair returned by `spearmanr`: the correlation and its p-value.
SpearmanrResult = namedtuple("SpearmanrResult", ["correlation", "pvalue"])
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Taken from scipy.mstats with following tweaks:
|
|
94
|
+
# 1. parallel pairwise computation
|
|
95
|
+
# 2. custom masking
|
|
96
|
+
def spearmanr(
    x, y=None, use_ties=True, axis=None, nan_policy="propagate", alternative="two-sided", mask_fn=ma.masked_invalid
):
    """Compute Spearman rank correlations and their p-values.

    Args:
        x: Observations of one or more variables. Converted to a masked
            array and flattened when *axis* is None.
        y: Optional additional observations, stacked onto *x* as extra
            columns (output axis 0) or extra rows (otherwise).
        use_ties: Must be true; ``False`` is rejected.
        axis: Axis along which observations run. With ``axis=1`` the data
            are transposed so that rows are observations and columns are
            variables.
        nan_policy: Only ``"omit"`` has an effect here: the data are passed
            through *mask_fn* before correlating.
        alternative: Forwarded to SciPy's ``_ttest_finish`` when computing
            the p-value.
        mask_fn: Callable applied to the stacked data when
            ``nan_policy == "omit"``; it is expected to return a masked
            array. Defaults to ``ma.masked_invalid``.

    Returns:
        ``SpearmanrResult(correlation, pvalue)``. For exactly two variables
        both fields are scalars (NaN when no usable data remain). Otherwise
        both are ``(n_vars, n_vars)`` arrays computed pair by pair, with ones
        on the diagonal of the correlations and zeros on the diagonal of the
        p-values.

    Raises:
        ValueError: If *use_ties* is false.
    """
    if not use_ties:
        raise ValueError("`use_ties=False` is not supported in SciPy >= 1.2.0")

    # Always returns a masked array, raveled if axis=None
    x, axisout = _chk_asarray(x, axis)
    if y is not None:
        # Deal only with 2-D `x` case.
        y, _ = _chk_asarray(y, axis)
        if axisout == 0:
            x = ma.column_stack((x, y))
        else:
            x = ma.row_stack((x, y))

    if axisout == 1:
        # To simplify the code that follow (always use `n_obs, n_vars` shape)
        x = x.T

    if nan_policy == "omit":
        x = mask_fn(x)

    def _spearmanr_2cols(x):
        # Mask the same observations for all variables, and then drop those
        # observations (can't leave them masked, rankdata is weird).
        x = ma.mask_rowcols(x, axis=0)
        # getmaskarray always yields a full boolean array, so this also works
        # when `x` carries no explicit mask.
        x = x[~ma.getmaskarray(x).any(axis=1), :]

        # Nothing to correlate: no rows are left, or every remaining value
        # is zero (np.any is False in both cases).
        if not np.any(x.data):
            return SpearmanrResult(np.nan, np.nan)

        m = ma.getmaskarray(x)
        n_obs = x.shape[0]
        dof = n_obs - 2 - int(m.sum(axis=0)[0])
        if dof < 0:
            return SpearmanrResult(np.nan, np.nan)

        # Gets the ranks and rank differences
        x_ranked = rankdata(x, axis=0)
        rs = ma.corrcoef(x_ranked, rowvar=False).data

        # rs can have elements equal to 1, so avoid zero division warnings
        with np.errstate(divide="ignore"):
            # clip the small negative values possibly caused by rounding
            # errors before taking the square root
            t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))

        # NOTE(review): `_ttest_finish` is a private SciPy helper; confirm it
        # is available in every SciPy version this package supports.
        t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)

        # For backwards compatibility, return scalars when comparing 2 columns
        if rs.shape == (2, 2):
            return SpearmanrResult(rs[1, 0], prob[1, 0])
        return SpearmanrResult(rs, prob)

    # Need to do this per pair of variables, otherwise the dropped observations
    # in a third column mess up the result for a pair.
    n_vars = x.shape[1]
    if n_vars == 2:
        return _spearmanr_2cols(x)

    # Enumerate the upper-triangle pairs once; the same list drives both the
    # parallel computation and the placement of its results.
    pairs = [(var1, var2) for var1 in range(n_vars - 1) for var2 in range(var1 + 1, n_vars)]
    max_cpu_cores = cpu_count(logical=False)
    with np.errstate(divide="ignore"):
        results = Parallel(n_jobs=max_cpu_cores)(
            delayed(_spearmanr_2cols)(x[:, [var1, var2]]) for var1, var2 in pairs
        )

    rs = np.ones((n_vars, n_vars), dtype=float)
    prob = np.zeros((n_vars, n_vars), dtype=float)
    # Results come back in submission order, so zip pairs them up without
    # repeatedly popping from the front of the list.
    for (var1, var2), result in zip(pairs, results):
        rs[var1, var2] = rs[var2, var1] = result.correlation
        prob[var1, var2] = prob[var2, var1] = result.pvalue

    return SpearmanrResult(rs, prob)
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import logging
|
|
3
|
+
from typing import Any, Dict, List, Optional, Union
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from joblib import Parallel, delayed
|
|
8
|
+
from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
|
|
9
|
+
from psutil import cpu_count
|
|
10
|
+
from scipy.stats import skew, spearmanr
|
|
11
|
+
|
|
12
|
+
from upgini.metadata import ModelTaskType, SearchKey
|
|
13
|
+
from upgini.utils import mstats
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def sort_columns(
    df: pd.DataFrame,
    target_column: Union[str, pd.Series],
    search_keys: Dict[str, SearchKey],
    model_task_type: ModelTaskType,
    exclude_columns: Optional[List[str]] = None,
    sort_all_columns: bool = False,
    logger: Optional[logging.Logger] = None,
) -> List[str]:
    """Return column names of *df* in the order to use for sorting rows.

    Search-key columns come first, ordered by the string form of their
    ``SearchKey``. The remaining columns follow in descending order of the
    score computed by ``get_sort_columns_dict``.

    Args:
        df: Source data; it is copied and not modified.
        target_column: The target, as a Series or as a column name of *df*.
        search_keys: Mapping of column name to ``SearchKey``.
        model_task_type: Forwarded to ``prepare_target``.
        exclude_columns: Columns left out of the result entirely.
        sort_all_columns: When true, columns with at most one distinct value
            are considered too, and the flag is forwarded to
            ``get_sort_columns_dict``.
        logger: Receives the duplicate-search-key warning. When omitted, the
            module logger is used with its level set to ``FATAL``, so the
            warning is not shown.

    Returns:
        Search-key columns present in *df*, followed by the other columns
        that received a score.
    """
    if exclude_columns is None:
        exclude_columns = []
    if logger is None:
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.FATAL)
    df = df.copy()  # avoid side effects

    # Check multiple search keys
    search_key_values = list(search_keys.values())
    if len(search_key_values) != len(set(search_key_values)):
        # Report through the logger selected above rather than the root logger.
        logger.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")

    excluded = set(exclude_columns)
    sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
    sorted_keys = [k for k in sorted_keys if k in df.columns and k not in excluded]
    key_columns = set(sorted_keys)

    # Test the flag first so nunique() is only computed when it can matter.
    other_columns = sorted(
        c
        for c in df.columns
        if c not in key_columns and c not in excluded and (sort_all_columns or df[c].nunique() > 1)
    )
    target = target_column if isinstance(target_column, pd.Series) else df[target_column]
    target = prepare_target(target, model_task_type)
    sort_dict = get_sort_columns_dict(
        df[sorted_keys + other_columns], target, sorted_keys, omit_nan=True, sort_all_columns=sort_all_columns
    )
    # Keep only the columns that actually received a score.
    other_columns = [c for c in other_columns if c in sort_dict]
    return sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def get_sort_columns_dict(
    df: pd.DataFrame,
    target: pd.Series,
    sorted_keys: List[str],
    omit_nan: bool,
    n_jobs: Optional[int] = None,
    sort_all_columns: bool = False,
) -> Dict[str, Any]:
    """Score the non-key columns of *df* for ordering purposes.

    Numeric columns are always scored. Non-numeric columns are scored only
    when *sort_all_columns* is set or when the frame without them contains
    duplicate rows; in that case they are replaced by integer codes from
    ``factorize(sort=True)``.

    Note that the codes are written into *df* itself.

    Args:
        df: Candidate columns, possibly including the search-key columns.
        target: Target values forwarded to the correlation computation.
        sorted_keys: Search-key columns, which are never scored.
        omit_nan: Forwarded to ``get_sort_columns_correlations``.
        n_jobs: Forwarded to ``get_sort_columns_correlations``.
        sort_all_columns: Force scoring of the non-numeric columns.

    Returns:
        Mapping of column name to ``(correlation, hash)``, or an empty dict
        when there is nothing to score.
    """
    key_columns = set(sorted_keys)
    string_features = [c for c in df.select_dtypes(exclude=[np.number]).columns if c not in key_columns]
    # Build the exclusion set once instead of concatenating lists per column.
    skipped = key_columns.union(string_features)
    columns_for_sort = [c for c in df.columns if c not in skipped]
    if len(string_features) > 0:
        # Check the flag first: it makes the duplicate scan unnecessary.
        if sort_all_columns or len(df) > len(df.drop(columns=string_features).drop_duplicates()):
            # factorize string features
            for c in string_features:
                df.loc[:, c] = pd.Series(df[c].factorize(sort=True)[0], index=df.index, dtype="int")
            columns_for_sort.extend(string_features)

    if len(columns_for_sort) == 0:
        return {}

    df = df[columns_for_sort]
    # Hashes are taken before the float32 conversion below.
    hashes = [hash_series(df[col]) for col in columns_for_sort]
    df = np.asarray(df, dtype=np.float32)
    correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)

    return {col: (corr, h) for col, corr, h in zip(columns_for_sort, correlations, hashes)}
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_sort_columns_correlations(df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: Optional[int] = None):
    """Return, for each column of *df*, its largest correlation with *target*.

    ``get_target_correlations`` is called with ``precision=7`` and its result
    is reduced with a maximum over axis 0.
    """
    per_method = get_target_correlations(df, target, omit_nan, n_jobs, precision=7)
    return np.max(per_method, axis=0)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def get_target_correlations(
    df: np.ndarray, target: pd.Series, omit_nan: bool, n_jobs: Optional[int] = None, precision: int = 15
):
    """Compute two absolute correlations with *target* for every column of *df*.

    Returns a ``(2, n_columns)`` array. Row 0 holds the values from
    ``calculate_spearman_corr_with_target``; row 1 holds absolute
    ``np.corrcoef`` coefficients. NaNs are replaced via ``np.nan_to_num`` and
    all values are truncated to *precision* decimal places.
    """
    features = np.asarray(df, dtype=np.float32)
    scale = 10**precision

    spearman_row = np.nan_to_num(
        calculate_spearman_corr_with_target(features, target, omit_nan, n_jobs), copy=False
    )
    # The last row of the combined correlation matrix belongs to the target;
    # its final entry (target vs. itself) is dropped.
    pearson_row = np.nan_to_num(np.abs(np.corrcoef(features.T, target.T, rowvar=True)[-1, :-1]))

    target_correlations = np.zeros((2, features.shape[1]))
    target_correlations[0, :] = spearman_row
    target_correlations[1, :] = pearson_row

    return np.trunc(target_correlations * scale) / scale
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def calculate_spearman_corr_with_target(
    X: Union[pd.DataFrame, np.ndarray], y: pd.Series, omit_nan: bool = False, n_jobs: Optional[int] = None
) -> np.ndarray:
    """Return the absolute Spearman correlation of each column of *X* with *y*.

    Columns whose values all equal their first value are skipped and stay
    NaN in the result. With *omit_nan* each remaining column is correlated
    separately via ``mstats.spearmanr(..., nan_policy="omit")`` in parallel.
    Otherwise columns containing NaN are skipped as well and the rest go
    through ``calculate_spearman`` in a single call.
    """
    if isinstance(X, pd.DataFrame):
        X = np.asarray(X, dtype=np.float32)

    if X.size == 0:
        return np.ndarray(shape=(0,))

    all_correlations = np.full(X.shape[1], np.nan)
    is_varying = [col.size > 0 and not (col == col[0]).all() for col in X.T]
    cols2calc = np.where(is_varying)[0]

    if omit_nan:
        workers = n_jobs or cpu_count(logical=False)
        results = Parallel(n_jobs=workers)(
            delayed(mstats.spearmanr)(X[:, i], y, nan_policy="omit", axis=0) for i in cols2calc
        )
        target_correlations = np.array([abs(res.correlation) for res in results])
    else:
        nan_free = ~np.isnan(X[:, cols2calc]).any(axis=0)
        cols2calc = cols2calc[nan_free]
        target_correlations = calculate_spearman(X[:, cols2calc], y, nan_policy="raise")
        if isinstance(target_correlations, float):
            # A scalar comes back when only two variables were compared.
            target_correlations = np.abs([target_correlations])
        else:
            # Last row of the matrix is the target; drop its self-correlation.
            target_correlations = np.abs(target_correlations)[-1, :-1]

    all_correlations[cols2calc] = target_correlations
    return all_correlations
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def calculate_spearman(X: np.ndarray, y: Optional[pd.Series], nan_policy: str):
    """Return the Spearman correlation among the columns of *X* and *y*.

    *y*, when given, counts as one more variable. With fewer than two
    variables in total the result is ``1.0``; otherwise it is the
    ``correlation`` attribute of ``scipy.stats.spearmanr``.
    """
    n_variables = X.shape[1] + (0 if y is None else 1)
    if n_variables >= 2:
        return spearmanr(X, y, nan_policy=nan_policy).correlation
    return 1.0
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def hash_series(series: pd.Series) -> int:
    """Return a deterministic integer fingerprint of *series*.

    The per-row hashes from ``pd.util.hash_pandas_object`` (index included)
    are fed to SHA-256 and the digest is read as a big-endian integer.
    """
    row_hashes = pd.util.hash_pandas_object(series, index=True).values
    digest = hashlib.sha256(row_hashes).digest()
    return int.from_bytes(digest, "big")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def prepare_target(target: pd.Series, model_task_type: ModelTaskType) -> pd.Series:
    """Convert *target* into a numeric Series, keeping its original name.

    For any task other than regression, or for a target that is neither
    numeric nor datetime, values are cast to strings and replaced by their
    category codes. A regression target that is non-negative and has absolute
    skewness (rounded to two decimals) of at least 0.9 is transformed with
    ``log1p``; any other regression target is returned unchanged.
    """
    target_name = target.name
    encode_as_labels = model_task_type != ModelTaskType.REGRESSION or not (
        is_numeric_dtype(target) or is_datetime64_any_dtype(target)
    )
    if encode_as_labels:
        target = target.astype(str).astype("category").cat.codes
    elif model_task_type == ModelTaskType.REGRESSION:
        skewness = round(abs(skew(target)), 2)
        if target.min() >= 0 and skewness >= 0.9:
            target = np.log1p(target)

    return pd.Series(target, name=target_name)
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import itertools
|
|
2
1
|
import logging
|
|
3
2
|
from typing import Callable, List, Optional, Union
|
|
4
3
|
|
|
@@ -208,7 +207,7 @@ def balance_undersample_forced(
|
|
|
208
207
|
id_columns: List[str],
|
|
209
208
|
date_column: str,
|
|
210
209
|
task_type: ModelTaskType,
|
|
211
|
-
cv_type: CVType
|
|
210
|
+
cv_type: Optional[CVType],
|
|
212
211
|
random_state: int,
|
|
213
212
|
sample_size: int = 7000,
|
|
214
213
|
logger: Optional[logging.Logger] = None,
|
|
@@ -372,7 +371,8 @@ def balance_undersample_time_series(
|
|
|
372
371
|
if len(id_counts) < min_different_ids:
|
|
373
372
|
if logger is not None:
|
|
374
373
|
logger.info(
|
|
375
|
-
f"Different ids count {len(id_counts)} for sample size {sample_size}
|
|
374
|
+
f"Different ids count {len(id_counts)} for sample size {sample_size}"
|
|
375
|
+
f" is less than min different ids {min_different_ids}, sampling time window"
|
|
376
376
|
)
|
|
377
377
|
date_counts = df.groupby(id_columns)[date_column].nunique().sort_values(ascending=False)
|
|
378
378
|
ids_to_sample = date_counts.index[:min_different_ids] if len(id_counts) > 0 else date_counts.index
|
|
@@ -8,23 +8,17 @@ def get_most_frequent_time_unit(df: pd.DataFrame, id_columns: List[str], date_co
|
|
|
8
8
|
def closest_unit(diff):
|
|
9
9
|
return pd.tseries.frequencies.to_offset(pd.Timedelta(diff, unit="s"))
|
|
10
10
|
|
|
11
|
-
# Calculate differences for each ID group
|
|
12
11
|
all_diffs = []
|
|
13
12
|
groups = df.groupby(id_columns) if id_columns else [(None, df)]
|
|
14
13
|
for _, group in groups:
|
|
15
|
-
# Get sorted dates for this group
|
|
16
14
|
group_dates = group[date_column].sort_values().unique()
|
|
17
15
|
if len(group_dates) > 1:
|
|
18
|
-
# Calculate time differences between consecutive dates
|
|
19
16
|
diff_series = pd.Series(group_dates[1:] - group_dates[:-1])
|
|
20
|
-
# Convert to nanoseconds
|
|
21
17
|
diff_ns = diff_series.dt.total_seconds()
|
|
22
18
|
all_diffs.extend(diff_ns)
|
|
23
19
|
|
|
24
|
-
# Convert to series for easier processing
|
|
25
20
|
all_diffs = pd.Series(all_diffs)
|
|
26
21
|
|
|
27
|
-
# Get most common time unit across all groups
|
|
28
22
|
most_frequent_unit = all_diffs.apply(closest_unit).mode().min()
|
|
29
23
|
|
|
30
24
|
return most_frequent_unit if isinstance(most_frequent_unit, pd.DateOffset) else None
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.60a3792.dev2"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/ads_management/ads_manager.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/normalizer/normalize_utils.py
RENAMED
|
File without changes
|
|
File without changes
|
{upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/resource_bundle/exceptions.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/sampler/random_under_sampler.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/base_search_key_detector.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.60a3792.dev2 → upgini-1.2.62a3818.dev1}/src/upgini/utils/fallback_progress_bar.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|