upgini 1.1.274a4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -0
- upgini/ads.py +6 -2
- upgini/ads_management/ads_manager.py +4 -2
- upgini/autofe/all_operands.py +4 -2
- upgini/autofe/binary.py +2 -1
- upgini/autofe/date.py +9 -2
- upgini/autofe/feature.py +1 -1
- upgini/autofe/groupby.py +3 -1
- upgini/autofe/operand.py +4 -3
- upgini/autofe/unary.py +20 -1
- upgini/autofe/vector.py +2 -0
- upgini/dataset.py +7 -6
- upgini/errors.py +1 -1
- upgini/features_enricher.py +44 -25
- upgini/http.py +11 -10
- upgini/mdc/__init__.py +1 -3
- upgini/mdc/context.py +4 -6
- upgini/metadata.py +3 -0
- upgini/metrics.py +110 -97
- upgini/normalizer/phone_normalizer.py +1 -1
- upgini/resource_bundle/__init__.py +5 -5
- upgini/resource_bundle/strings.properties +1 -1
- upgini/sampler/base.py +1 -4
- upgini/sampler/random_under_sampler.py +2 -5
- upgini/search_task.py +4 -4
- upgini/spinner.py +1 -1
- upgini/utils/__init__.py +3 -2
- upgini/utils/base_search_key_detector.py +2 -2
- upgini/utils/blocked_time_series.py +4 -2
- upgini/utils/country_utils.py +2 -2
- upgini/utils/custom_loss_utils.py +3 -2
- upgini/utils/cv_utils.py +2 -2
- upgini/utils/datetime_utils.py +25 -19
- upgini/utils/email_utils.py +3 -3
- upgini/utils/fallback_progress_bar.py +1 -1
- upgini/utils/features_validator.py +2 -1
- upgini/utils/progress_bar.py +1 -1
- upgini/utils/sklearn_ext.py +14 -13
- upgini/utils/target_utils.py +1 -1
- upgini/utils/track_info.py +27 -15
- upgini/version_validator.py +2 -2
- {upgini-1.1.274a4.dist-info → upgini-1.1.280a3418.post2.dist-info}/METADATA +21 -23
- upgini-1.1.280a3418.post2.dist-info/RECORD +62 -0
- {upgini-1.1.274a4.dist-info → upgini-1.1.280a3418.post2.dist-info}/WHEEL +1 -2
- upgini/fingerprint.js +0 -8
- upgini-1.1.274a4.dist-info/RECORD +0 -63
- upgini-1.1.274a4.dist-info/top_level.txt +0 -1
- {upgini-1.1.274a4.dist-info → upgini-1.1.280a3418.post2.dist-info/licenses}/LICENSE +0 -0
upgini/__about__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.280a3418-2"
|
upgini/ads.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Dict, Optional
|
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
|
-
from pandas.api.types import is_string_dtype
|
|
8
|
+
from pandas.api.types import is_object_dtype, is_string_dtype
|
|
9
9
|
|
|
10
10
|
from upgini import SearchKey
|
|
11
11
|
from upgini.http import get_rest_client
|
|
@@ -34,7 +34,11 @@ def upload_user_ads(name: str, df: pd.DataFrame, search_keys: Dict[str, SearchKe
|
|
|
34
34
|
if df[column_name].notnull().sum() < min_valid_rows_count:
|
|
35
35
|
raise ValueError(bundle.get("ads_upload_to_many_empty_rows"))
|
|
36
36
|
meaning_type = search_keys[column_name].value
|
|
37
|
-
if
|
|
37
|
+
if (
|
|
38
|
+
meaning_type == FileColumnMeaningType.MSISDN
|
|
39
|
+
and not is_string_dtype(df[column_name])
|
|
40
|
+
and not is_object_dtype(df[column_name])
|
|
41
|
+
):
|
|
38
42
|
df[column_name] = df[column_name].values.astype(np.int64).astype("string") # type: ignore
|
|
39
43
|
else:
|
|
40
44
|
meaning_type = FileColumnMeaningType.FEATURE
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
|
|
2
4
|
from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
|
|
3
5
|
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
4
6
|
from upgini.autofe.operand import Operand
|
|
5
|
-
from upgini.autofe.unary import Abs,
|
|
6
|
-
from upgini.autofe.binary import Min, Max, Add, Subtract, Multiply, Divide, Sim
|
|
7
|
+
from upgini.autofe.unary import Abs, Bin, Floor, Freq, Log, Residual, Sigmoid, Sqrt, Square
|
|
7
8
|
from upgini.autofe.vector import Mean, Sum
|
|
8
9
|
|
|
9
10
|
ALL_OPERANDS: Dict[str, Operand] = {
|
|
@@ -48,6 +49,7 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
48
49
|
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
|
|
49
50
|
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
|
|
50
51
|
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
|
|
52
|
+
Bin(),
|
|
51
53
|
]
|
|
52
54
|
}
|
|
53
55
|
|
upgini/autofe/binary.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
2
1
|
import numpy as np
|
|
3
2
|
import pandas as pd
|
|
4
3
|
from numpy import dot
|
|
5
4
|
from numpy.linalg import norm
|
|
6
5
|
|
|
6
|
+
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
7
|
+
|
|
7
8
|
|
|
8
9
|
class Min(PandasOperand):
|
|
9
10
|
name = "min"
|
upgini/autofe/date.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
from typing import Any, Optional, Union
|
|
2
|
+
|
|
2
3
|
import numpy as np
|
|
3
4
|
import pandas as pd
|
|
5
|
+
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
4
6
|
from pydantic import BaseModel
|
|
5
7
|
|
|
6
8
|
from upgini.autofe.operand import PandasOperand
|
|
@@ -73,8 +75,13 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
73
75
|
|
|
74
76
|
return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
|
|
75
77
|
|
|
76
|
-
def _diff(self, x):
|
|
77
|
-
|
|
78
|
+
def _diff(self, x: TimedeltaArray):
|
|
79
|
+
if self.diff_unit == "Y":
|
|
80
|
+
x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
|
|
81
|
+
elif self.diff_unit == "M":
|
|
82
|
+
raise Exception("Unsupported difference unit: Month")
|
|
83
|
+
else:
|
|
84
|
+
x = x / np.timedelta64(1, self.diff_unit)
|
|
78
85
|
return x[x > 0]
|
|
79
86
|
|
|
80
87
|
def _agg(self, x):
|
upgini/autofe/feature.py
CHANGED
upgini/autofe/groupby.py
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
2
1
|
from typing import Optional
|
|
2
|
+
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
+
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
6
|
+
|
|
5
7
|
|
|
6
8
|
class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
7
9
|
agg: Optional[str]
|
upgini/autofe/operand.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
from pydantic import BaseModel
|
|
2
|
-
from typing import Dict, List, Optional, Tuple, Union
|
|
3
1
|
import abc
|
|
4
|
-
import
|
|
2
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
3
|
+
|
|
5
4
|
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from pydantic import BaseModel
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class Operand(BaseModel):
|
upgini/autofe/unary.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
2
1
|
import numpy as np
|
|
3
2
|
import pandas as pd
|
|
4
3
|
|
|
4
|
+
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
5
|
+
|
|
5
6
|
|
|
6
7
|
class Abs(PandasOperand, VectorizableMixin):
|
|
7
8
|
name = "abs"
|
|
@@ -110,3 +111,21 @@ class Freq(PandasOperand):
|
|
|
110
111
|
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
111
112
|
value_counts = data.value_counts(normalize=True)
|
|
112
113
|
return self._loc(data, value_counts)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class Bin(PandasOperand):
|
|
117
|
+
name = "bin"
|
|
118
|
+
is_unary = True
|
|
119
|
+
output_type = "int"
|
|
120
|
+
input_type = "discrete"
|
|
121
|
+
|
|
122
|
+
zero_bound_low: int
|
|
123
|
+
zero_bound_high: int
|
|
124
|
+
step: int
|
|
125
|
+
|
|
126
|
+
def calculate_unary(self, data: pd.Series) -> pd.Series:
|
|
127
|
+
res = pd.Series(np.zeros(data.shape), index=data.index, dtype="int")
|
|
128
|
+
res.update((data[data < self.zero_bound_low] - self.zero_bound_low) // self.step)
|
|
129
|
+
res.update((data[data >= self.zero_bound_high] - self.zero_bound_high) // self.step + 1)
|
|
130
|
+
|
|
131
|
+
return res
|
upgini/autofe/vector.py
CHANGED
upgini/dataset.py
CHANGED
|
@@ -15,6 +15,7 @@ from pandas.api.types import (
|
|
|
15
15
|
is_float_dtype,
|
|
16
16
|
is_integer_dtype,
|
|
17
17
|
is_numeric_dtype,
|
|
18
|
+
is_object_dtype,
|
|
18
19
|
is_period_dtype,
|
|
19
20
|
is_string_dtype,
|
|
20
21
|
)
|
|
@@ -94,7 +95,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
94
95
|
data = pd.read_csv(path, **kwargs)
|
|
95
96
|
else:
|
|
96
97
|
# try different separators: , ; \t ...
|
|
97
|
-
with open(path
|
|
98
|
+
with open(path) as csvfile:
|
|
98
99
|
sep = csv.Sniffer().sniff(csvfile.read(2048)).delimiter
|
|
99
100
|
kwargs["sep"] = sep
|
|
100
101
|
data = pd.read_csv(path, **kwargs)
|
|
@@ -219,7 +220,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
219
220
|
"""Check that string values less than maximum characters for LLM"""
|
|
220
221
|
# self.logger.info("Validate too long string values")
|
|
221
222
|
for col in self.data.columns:
|
|
222
|
-
if is_string_dtype(self.data[col]):
|
|
223
|
+
if is_string_dtype(self.data[col]) or is_object_dtype(self.data[col]):
|
|
223
224
|
max_length: int = self.data[col].astype("str").str.len().max()
|
|
224
225
|
if max_length > self.MAX_STRING_FEATURE_LENGTH:
|
|
225
226
|
self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
|
|
@@ -250,7 +251,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
250
251
|
@staticmethod
|
|
251
252
|
def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
|
|
252
253
|
try:
|
|
253
|
-
if isinstance(ip, IPv4Address
|
|
254
|
+
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
254
255
|
return int(ip)
|
|
255
256
|
except Exception:
|
|
256
257
|
pass
|
|
@@ -258,7 +259,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
258
259
|
@staticmethod
|
|
259
260
|
def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
|
|
260
261
|
try:
|
|
261
|
-
if isinstance(ip, IPv4Address
|
|
262
|
+
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
262
263
|
return str(int(ip))
|
|
263
264
|
except Exception:
|
|
264
265
|
pass
|
|
@@ -350,7 +351,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
350
351
|
if postal_code is not None and postal_code in self.data.columns:
|
|
351
352
|
# self.logger.info("Normalize postal code")
|
|
352
353
|
|
|
353
|
-
if is_string_dtype(self.data[postal_code]):
|
|
354
|
+
if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
|
|
354
355
|
try:
|
|
355
356
|
self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
|
|
356
357
|
except Exception:
|
|
@@ -821,7 +822,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
821
822
|
return DataType.INT
|
|
822
823
|
elif is_float_dtype(pandas_data_type):
|
|
823
824
|
return DataType.DECIMAL
|
|
824
|
-
elif is_string_dtype(pandas_data_type):
|
|
825
|
+
elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
|
|
825
826
|
return DataType.STRING
|
|
826
827
|
else:
|
|
827
828
|
msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
|
upgini/errors.py
CHANGED
|
@@ -16,7 +16,7 @@ class UnauthorizedError(HttpError):
|
|
|
16
16
|
"""Unauthorized error from REST API."""
|
|
17
17
|
|
|
18
18
|
def __init__(self, message, status_code):
|
|
19
|
-
message = "Unauthorized, please check your authorization token ({})"
|
|
19
|
+
message = f"Unauthorized, please check your authorization token ({message})"
|
|
20
20
|
super(UnauthorizedError, self).__init__(message, status_code)
|
|
21
21
|
|
|
22
22
|
|
upgini/features_enricher.py
CHANGED
|
@@ -21,6 +21,7 @@ from pandas.api.types import (
|
|
|
21
21
|
is_bool,
|
|
22
22
|
is_datetime64_any_dtype,
|
|
23
23
|
is_numeric_dtype,
|
|
24
|
+
is_object_dtype,
|
|
24
25
|
is_period_dtype,
|
|
25
26
|
is_string_dtype,
|
|
26
27
|
)
|
|
@@ -424,6 +425,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
424
425
|
|
|
425
426
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
426
427
|
|
|
428
|
+
# Validate client estimator params
|
|
429
|
+
self._get_client_cat_features(estimator, X, self.search_keys)
|
|
430
|
+
|
|
427
431
|
try:
|
|
428
432
|
self.X = X
|
|
429
433
|
self.y = y
|
|
@@ -817,6 +821,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
817
821
|
trace_id = trace_id or str(uuid.uuid4())
|
|
818
822
|
start_time = time.time()
|
|
819
823
|
with MDC(trace_id=trace_id):
|
|
824
|
+
self.logger.info("Start calculate metrics")
|
|
820
825
|
if len(args) > 0:
|
|
821
826
|
msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
|
|
822
827
|
self.logger.warning(msg)
|
|
@@ -868,22 +873,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
868
873
|
self.__display_support_link(msg)
|
|
869
874
|
return None
|
|
870
875
|
|
|
871
|
-
cat_features =
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
estimator is not None
|
|
875
|
-
and hasattr(estimator, "get_param")
|
|
876
|
-
and estimator.get_param("cat_features") is not None
|
|
877
|
-
):
|
|
878
|
-
cat_features = estimator.get_param("cat_features")
|
|
879
|
-
if len(cat_features) > 0 and isinstance(cat_features[0], int):
|
|
880
|
-
cat_features = [effective_X.columns[i] for i in cat_features]
|
|
881
|
-
for cat_feature in cat_features:
|
|
882
|
-
if cat_feature in self.search_keys:
|
|
883
|
-
if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
884
|
-
search_keys_for_metrics.append(cat_feature)
|
|
885
|
-
else:
|
|
886
|
-
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
876
|
+
cat_features, search_keys_for_metrics = self._get_client_cat_features(
|
|
877
|
+
estimator, effective_X, self.search_keys
|
|
878
|
+
)
|
|
887
879
|
|
|
888
880
|
prepared_data = self._prepare_data_for_metrics(
|
|
889
881
|
trace_id=trace_id,
|
|
@@ -898,6 +890,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
898
890
|
search_keys_for_metrics=search_keys_for_metrics,
|
|
899
891
|
progress_bar=progress_bar,
|
|
900
892
|
progress_callback=progress_callback,
|
|
893
|
+
cat_features=cat_features,
|
|
901
894
|
)
|
|
902
895
|
if prepared_data is None:
|
|
903
896
|
return None
|
|
@@ -1273,6 +1266,29 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1273
1266
|
|
|
1274
1267
|
return _cv, groups
|
|
1275
1268
|
|
|
1269
|
+
def _get_client_cat_features(
|
|
1270
|
+
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1271
|
+
) -> Optional[List[str]]:
|
|
1272
|
+
cat_features = None
|
|
1273
|
+
search_keys_for_metrics = []
|
|
1274
|
+
if (
|
|
1275
|
+
estimator is not None
|
|
1276
|
+
and hasattr(estimator, "get_param")
|
|
1277
|
+
and estimator.get_param("cat_features") is not None
|
|
1278
|
+
):
|
|
1279
|
+
cat_features = estimator.get_param("cat_features")
|
|
1280
|
+
if len(cat_features) > 0:
|
|
1281
|
+
if all([isinstance(f, int) for f in cat_features]):
|
|
1282
|
+
cat_features = [X.columns[i] for i in cat_features]
|
|
1283
|
+
self.logger.info(f"Collected categorical features {cat_features} from user estimator")
|
|
1284
|
+
for cat_feature in cat_features:
|
|
1285
|
+
if cat_feature in search_keys:
|
|
1286
|
+
if search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
1287
|
+
search_keys_for_metrics.append(cat_feature)
|
|
1288
|
+
else:
|
|
1289
|
+
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
1290
|
+
return cat_features, search_keys_for_metrics
|
|
1291
|
+
|
|
1276
1292
|
def _prepare_data_for_metrics(
|
|
1277
1293
|
self,
|
|
1278
1294
|
trace_id: str,
|
|
@@ -1287,6 +1303,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1287
1303
|
search_keys_for_metrics: Optional[List[str]] = None,
|
|
1288
1304
|
progress_bar: Optional[ProgressBar] = None,
|
|
1289
1305
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1306
|
+
cat_features: Optional[List[str]] = None,
|
|
1290
1307
|
):
|
|
1291
1308
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
|
1292
1309
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
|
@@ -1344,9 +1361,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1344
1361
|
|
|
1345
1362
|
# Detect and drop high cardinality columns in train
|
|
1346
1363
|
columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
]
|
|
1364
|
+
non_excluding_columns = (self.generate_features or []) + (cat_features or [])
|
|
1365
|
+
columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
|
|
1350
1366
|
if len(columns_with_high_cardinality) > 0:
|
|
1351
1367
|
self.logger.warning(
|
|
1352
1368
|
f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
|
|
@@ -2532,7 +2548,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2532
2548
|
validated_X = X.copy()
|
|
2533
2549
|
elif isinstance(X, pd.Series):
|
|
2534
2550
|
validated_X = X.to_frame()
|
|
2535
|
-
elif isinstance(X, np.ndarray)
|
|
2551
|
+
elif isinstance(X, (list, np.ndarray)):
|
|
2536
2552
|
validated_X = pd.DataFrame(X)
|
|
2537
2553
|
renaming = {c: str(c) for c in validated_X.columns}
|
|
2538
2554
|
validated_X = validated_X.rename(columns=renaming)
|
|
@@ -2621,7 +2637,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2621
2637
|
validated_eval_X = eval_X.copy()
|
|
2622
2638
|
elif isinstance(eval_X, pd.Series):
|
|
2623
2639
|
validated_eval_X = eval_X.to_frame()
|
|
2624
|
-
elif isinstance(eval_X, np.ndarray)
|
|
2640
|
+
elif isinstance(eval_X, (list, np.ndarray)):
|
|
2625
2641
|
validated_eval_X = pd.DataFrame(eval_X)
|
|
2626
2642
|
renaming = {c: str(c) for c in validated_eval_X.columns}
|
|
2627
2643
|
validated_eval_X = validated_eval_X.rename(columns=renaming)
|
|
@@ -2803,7 +2819,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2803
2819
|
)
|
|
2804
2820
|
|
|
2805
2821
|
def sample(df):
|
|
2806
|
-
if isinstance(df, pd.
|
|
2822
|
+
if isinstance(df, (pd.DataFrame, pd.Series)):
|
|
2807
2823
|
return df.head(10)
|
|
2808
2824
|
else:
|
|
2809
2825
|
return df[:10]
|
|
@@ -2967,7 +2983,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2967
2983
|
|
|
2968
2984
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
2969
2985
|
target = df[self.TARGET_NAME]
|
|
2970
|
-
if is_string_dtype(target):
|
|
2986
|
+
if is_string_dtype(target) or is_object_dtype(target):
|
|
2971
2987
|
maybe_numeric_target = pd.to_numeric(target, errors="coerce")
|
|
2972
2988
|
# If less than 5% is non numeric then leave this rows with NaN target and later it will be dropped
|
|
2973
2989
|
if maybe_numeric_target.isna().sum() <= _num_samples(df) * 0.05:
|
|
@@ -3240,6 +3256,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3240
3256
|
descriptions = []
|
|
3241
3257
|
for m in autofe_meta:
|
|
3242
3258
|
autofe_feature = Feature.from_formula(m.formula)
|
|
3259
|
+
orig_to_hashed = {base_column.original_name: base_column.hashed_name for base_column in m.base_columns}
|
|
3260
|
+
autofe_feature.rename_columns(orig_to_hashed)
|
|
3243
3261
|
autofe_feature.set_display_index(m.display_index)
|
|
3244
3262
|
if autofe_feature.op.is_vector:
|
|
3245
3263
|
continue
|
|
@@ -3367,7 +3385,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3367
3385
|
valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
|
|
3368
3386
|
else:
|
|
3369
3387
|
if x[column_name].isnull().all() or (
|
|
3370
|
-
is_string_dtype(x[column_name])
|
|
3388
|
+
(is_string_dtype(x[column_name]) or is_object_dtype(x[column_name]))
|
|
3389
|
+
and (x[column_name].astype("string").str.strip() == "").all()
|
|
3371
3390
|
):
|
|
3372
3391
|
raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
|
|
3373
3392
|
|
|
@@ -3674,7 +3693,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3674
3693
|
def sample(inp, sample_index):
|
|
3675
3694
|
if _num_samples(inp) <= 1000:
|
|
3676
3695
|
return inp
|
|
3677
|
-
if isinstance(inp, pd.DataFrame
|
|
3696
|
+
if isinstance(inp, (pd.DataFrame, pd.Series)):
|
|
3678
3697
|
return inp.sample(n=1000, random_state=random_state)
|
|
3679
3698
|
if isinstance(inp, np.ndarray):
|
|
3680
3699
|
return inp[sample_index]
|
upgini/http.py
CHANGED
|
@@ -22,6 +22,7 @@ from pydantic import BaseModel
|
|
|
22
22
|
from pythonjsonlogger import jsonlogger
|
|
23
23
|
from requests.exceptions import RequestException
|
|
24
24
|
|
|
25
|
+
from upgini.__about__ import __version__
|
|
25
26
|
from upgini.errors import (
|
|
26
27
|
HttpError,
|
|
27
28
|
UnauthorizedError,
|
|
@@ -38,17 +39,17 @@ from upgini.metadata import (
|
|
|
38
39
|
from upgini.resource_bundle import bundle
|
|
39
40
|
from upgini.utils.track_info import get_track_metrics
|
|
40
41
|
|
|
41
|
-
try:
|
|
42
|
-
|
|
42
|
+
# try:
|
|
43
|
+
# from importlib.metadata import version # type: ignore
|
|
43
44
|
|
|
44
|
-
|
|
45
|
-
except ImportError:
|
|
46
|
-
|
|
47
|
-
|
|
45
|
+
# __version__ = version("upgini")
|
|
46
|
+
# except ImportError:
|
|
47
|
+
# try:
|
|
48
|
+
# from importlib_metadata import version # type: ignore
|
|
48
49
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
50
|
+
# __version__ = version("upgini")
|
|
51
|
+
# except ImportError:
|
|
52
|
+
# __version__ = "Upgini wasn't installed"
|
|
52
53
|
|
|
53
54
|
UPGINI_URL: str = "UPGINI_URL"
|
|
54
55
|
UPGINI_API_KEY: str = "UPGINI_API_KEY"
|
|
@@ -925,7 +926,7 @@ def is_demo_api_key(api_token: Optional[str]) -> bool:
|
|
|
925
926
|
return api_token is None or api_token == "" or api_token == DEMO_API_KEY
|
|
926
927
|
|
|
927
928
|
|
|
928
|
-
@lru_cache
|
|
929
|
+
@lru_cache
|
|
929
930
|
def _get_rest_client(
|
|
930
931
|
backend_url: str, api_token: str, client_ip: Optional[str] = None, client_visitorid: Optional[str] = None
|
|
931
932
|
) -> _RestClient:
|
upgini/mdc/__init__.py
CHANGED
|
@@ -1,15 +1,13 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
1
|
"""
|
|
3
2
|
.. module: mdc
|
|
4
3
|
.. moduleauthor:: Aljosha Friemann a.friemann@automate.wtf
|
|
5
4
|
"""
|
|
6
|
-
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
7
5
|
|
|
8
6
|
import logging
|
|
9
7
|
|
|
10
|
-
from upgini.mdc.context import new_log_context, get_mdc_fields
|
|
11
8
|
from pythonjsonlogger import jsonlogger
|
|
12
9
|
|
|
10
|
+
from upgini.mdc.context import get_mdc_fields, new_log_context
|
|
13
11
|
|
|
14
12
|
MDContext = new_log_context
|
|
15
13
|
MDC = new_log_context
|
upgini/mdc/context.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
1
|
"""
|
|
3
2
|
.. module: TODO
|
|
4
3
|
:platform: TODO
|
|
@@ -7,12 +6,11 @@
|
|
|
7
6
|
.. moduleauthor:: Aljosha Friemann a.friemann@automate.wtf
|
|
8
7
|
"""
|
|
9
8
|
|
|
10
|
-
import
|
|
11
|
-
import uuid
|
|
9
|
+
import collections
|
|
12
10
|
import logging
|
|
13
11
|
import threading
|
|
14
|
-
import
|
|
15
|
-
|
|
12
|
+
import time
|
|
13
|
+
import uuid
|
|
16
14
|
from contextlib import contextmanager
|
|
17
15
|
|
|
18
16
|
LOGGER = logging.getLogger(__name__)
|
|
@@ -32,7 +30,7 @@ def get_mdc_fields():
|
|
|
32
30
|
|
|
33
31
|
@contextmanager
|
|
34
32
|
def new_log_context(**kwargs):
|
|
35
|
-
context_id = "mdc-{
|
|
33
|
+
context_id = f"mdc-{threading.current_thread().ident}-{uuid.uuid4()}"
|
|
36
34
|
|
|
37
35
|
LOGGER.debug("creating context %s", context_id)
|
|
38
36
|
|
upgini/metadata.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from enum import Enum
|
|
2
4
|
from typing import Dict, List, Optional, Set
|
|
3
5
|
|
|
@@ -201,6 +203,7 @@ class FileMetadata(BaseModel):
|
|
|
201
203
|
for c in self.columns:
|
|
202
204
|
if c.name == name:
|
|
203
205
|
return c
|
|
206
|
+
return None
|
|
204
207
|
|
|
205
208
|
def search_types(self) -> Set[SearchKey]:
|
|
206
209
|
search_keys = set()
|