upgini 1.1.274a4__py3-none-any.whl → 1.1.280.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -0
- upgini/ads.py +6 -2
- upgini/ads_management/ads_manager.py +4 -2
- upgini/autofe/all_operands.py +3 -2
- upgini/autofe/binary.py +2 -1
- upgini/autofe/date.py +9 -2
- upgini/autofe/feature.py +1 -1
- upgini/autofe/groupby.py +3 -1
- upgini/autofe/operand.py +4 -3
- upgini/autofe/unary.py +2 -1
- upgini/autofe/vector.py +2 -0
- upgini/dataset.py +7 -6
- upgini/errors.py +1 -1
- upgini/features_enricher.py +52 -27
- upgini/http.py +11 -10
- upgini/mdc/__init__.py +1 -3
- upgini/mdc/context.py +4 -6
- upgini/metadata.py +3 -0
- upgini/metrics.py +110 -97
- upgini/normalizer/phone_normalizer.py +1 -1
- upgini/resource_bundle/__init__.py +5 -5
- upgini/resource_bundle/strings.properties +1 -1
- upgini/sampler/base.py +1 -4
- upgini/sampler/random_under_sampler.py +2 -5
- upgini/search_task.py +4 -4
- upgini/spinner.py +1 -1
- upgini/utils/__init__.py +3 -2
- upgini/utils/base_search_key_detector.py +2 -2
- upgini/utils/blocked_time_series.py +4 -2
- upgini/utils/country_utils.py +2 -2
- upgini/utils/custom_loss_utils.py +3 -2
- upgini/utils/cv_utils.py +2 -2
- upgini/utils/datetime_utils.py +25 -19
- upgini/utils/email_utils.py +3 -3
- upgini/utils/fallback_progress_bar.py +1 -1
- upgini/utils/features_validator.py +2 -1
- upgini/utils/progress_bar.py +1 -1
- upgini/utils/sklearn_ext.py +14 -13
- upgini/utils/target_utils.py +1 -1
- upgini/utils/track_info.py +27 -15
- upgini/version_validator.py +2 -2
- {upgini-1.1.274a4.dist-info → upgini-1.1.280.dev0.dist-info}/METADATA +21 -23
- upgini-1.1.280.dev0.dist-info/RECORD +62 -0
- {upgini-1.1.274a4.dist-info → upgini-1.1.280.dev0.dist-info}/WHEEL +1 -2
- upgini/fingerprint.js +0 -8
- upgini-1.1.274a4.dist-info/RECORD +0 -63
- upgini-1.1.274a4.dist-info/top_level.txt +0 -1
- {upgini-1.1.274a4.dist-info → upgini-1.1.280.dev0.dist-info/licenses}/LICENSE +0 -0
upgini/__about__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.1.280.dev0"
|
upgini/ads.py
CHANGED
|
@@ -5,7 +5,7 @@ from typing import Dict, Optional
|
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
8
|
-
from pandas.api.types import is_string_dtype
|
|
8
|
+
from pandas.api.types import is_object_dtype, is_string_dtype
|
|
9
9
|
|
|
10
10
|
from upgini import SearchKey
|
|
11
11
|
from upgini.http import get_rest_client
|
|
@@ -34,7 +34,11 @@ def upload_user_ads(name: str, df: pd.DataFrame, search_keys: Dict[str, SearchKe
|
|
|
34
34
|
if df[column_name].notnull().sum() < min_valid_rows_count:
|
|
35
35
|
raise ValueError(bundle.get("ads_upload_to_many_empty_rows"))
|
|
36
36
|
meaning_type = search_keys[column_name].value
|
|
37
|
-
if
|
|
37
|
+
if (
|
|
38
|
+
meaning_type == FileColumnMeaningType.MSISDN
|
|
39
|
+
and not is_string_dtype(df[column_name])
|
|
40
|
+
and not is_object_dtype(df[column_name])
|
|
41
|
+
):
|
|
38
42
|
df[column_name] = df[column_name].values.astype(np.int64).astype("string") # type: ignore
|
|
39
43
|
else:
|
|
40
44
|
meaning_type = FileColumnMeaningType.FEATURE
|
upgini/autofe/all_operands.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from upgini.autofe.binary import Add, Divide, Max, Min, Multiply, Sim, Subtract
|
|
2
4
|
from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
|
|
3
5
|
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
4
6
|
from upgini.autofe.operand import Operand
|
|
5
|
-
from upgini.autofe.unary import Abs,
|
|
6
|
-
from upgini.autofe.binary import Min, Max, Add, Subtract, Multiply, Divide, Sim
|
|
7
|
+
from upgini.autofe.unary import Abs, Floor, Freq, Log, Residual, Sigmoid, Sqrt, Square
|
|
7
8
|
from upgini.autofe.vector import Mean, Sum
|
|
8
9
|
|
|
9
10
|
ALL_OPERANDS: Dict[str, Operand] = {
|
upgini/autofe/binary.py
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
2
1
|
import numpy as np
|
|
3
2
|
import pandas as pd
|
|
4
3
|
from numpy import dot
|
|
5
4
|
from numpy.linalg import norm
|
|
6
5
|
|
|
6
|
+
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
7
|
+
|
|
7
8
|
|
|
8
9
|
class Min(PandasOperand):
|
|
9
10
|
name = "min"
|
upgini/autofe/date.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
from typing import Any, Optional, Union
|
|
2
|
+
|
|
2
3
|
import numpy as np
|
|
3
4
|
import pandas as pd
|
|
5
|
+
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
4
6
|
from pydantic import BaseModel
|
|
5
7
|
|
|
6
8
|
from upgini.autofe.operand import PandasOperand
|
|
@@ -73,8 +75,13 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
73
75
|
|
|
74
76
|
return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
|
|
75
77
|
|
|
76
|
-
def _diff(self, x):
|
|
77
|
-
|
|
78
|
+
def _diff(self, x: TimedeltaArray):
|
|
79
|
+
if self.diff_unit == "Y":
|
|
80
|
+
x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
|
|
81
|
+
elif self.diff_unit == "M":
|
|
82
|
+
raise Exception("Unsupported difference unit: Month")
|
|
83
|
+
else:
|
|
84
|
+
x = x / np.timedelta64(1, self.diff_unit)
|
|
78
85
|
return x[x > 0]
|
|
79
86
|
|
|
80
87
|
def _agg(self, x):
|
upgini/autofe/feature.py
CHANGED
upgini/autofe/groupby.py
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
2
1
|
from typing import Optional
|
|
2
|
+
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
+
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
6
|
+
|
|
5
7
|
|
|
6
8
|
class GroupByThenAgg(PandasOperand, VectorizableMixin):
|
|
7
9
|
agg: Optional[str]
|
upgini/autofe/operand.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
from pydantic import BaseModel
|
|
2
|
-
from typing import Dict, List, Optional, Tuple, Union
|
|
3
1
|
import abc
|
|
4
|
-
import
|
|
2
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
3
|
+
|
|
5
4
|
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from pydantic import BaseModel
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class Operand(BaseModel):
|
upgini/autofe/unary.py
CHANGED
upgini/autofe/vector.py
CHANGED
upgini/dataset.py
CHANGED
|
@@ -15,6 +15,7 @@ from pandas.api.types import (
|
|
|
15
15
|
is_float_dtype,
|
|
16
16
|
is_integer_dtype,
|
|
17
17
|
is_numeric_dtype,
|
|
18
|
+
is_object_dtype,
|
|
18
19
|
is_period_dtype,
|
|
19
20
|
is_string_dtype,
|
|
20
21
|
)
|
|
@@ -94,7 +95,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
94
95
|
data = pd.read_csv(path, **kwargs)
|
|
95
96
|
else:
|
|
96
97
|
# try different separators: , ; \t ...
|
|
97
|
-
with open(path
|
|
98
|
+
with open(path) as csvfile:
|
|
98
99
|
sep = csv.Sniffer().sniff(csvfile.read(2048)).delimiter
|
|
99
100
|
kwargs["sep"] = sep
|
|
100
101
|
data = pd.read_csv(path, **kwargs)
|
|
@@ -219,7 +220,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
219
220
|
"""Check that string values less than maximum characters for LLM"""
|
|
220
221
|
# self.logger.info("Validate too long string values")
|
|
221
222
|
for col in self.data.columns:
|
|
222
|
-
if is_string_dtype(self.data[col]):
|
|
223
|
+
if is_string_dtype(self.data[col]) or is_object_dtype(self.data[col]):
|
|
223
224
|
max_length: int = self.data[col].astype("str").str.len().max()
|
|
224
225
|
if max_length > self.MAX_STRING_FEATURE_LENGTH:
|
|
225
226
|
self.data[col] = self.data[col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
|
|
@@ -250,7 +251,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
250
251
|
@staticmethod
|
|
251
252
|
def _ip_to_int(ip: Optional[_BaseAddress]) -> Optional[int]:
|
|
252
253
|
try:
|
|
253
|
-
if isinstance(ip, IPv4Address
|
|
254
|
+
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
254
255
|
return int(ip)
|
|
255
256
|
except Exception:
|
|
256
257
|
pass
|
|
@@ -258,7 +259,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
258
259
|
@staticmethod
|
|
259
260
|
def _ip_to_int_str(ip: Optional[_BaseAddress]) -> Optional[str]:
|
|
260
261
|
try:
|
|
261
|
-
if isinstance(ip, IPv4Address
|
|
262
|
+
if isinstance(ip, (IPv4Address, IPv6Address)):
|
|
262
263
|
return str(int(ip))
|
|
263
264
|
except Exception:
|
|
264
265
|
pass
|
|
@@ -350,7 +351,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
350
351
|
if postal_code is not None and postal_code in self.data.columns:
|
|
351
352
|
# self.logger.info("Normalize postal code")
|
|
352
353
|
|
|
353
|
-
if is_string_dtype(self.data[postal_code]):
|
|
354
|
+
if is_string_dtype(self.data[postal_code]) or is_object_dtype(self.data[postal_code]):
|
|
354
355
|
try:
|
|
355
356
|
self.data[postal_code] = self.data[postal_code].astype("float64").astype("Int64").astype("string")
|
|
356
357
|
except Exception:
|
|
@@ -821,7 +822,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
821
822
|
return DataType.INT
|
|
822
823
|
elif is_float_dtype(pandas_data_type):
|
|
823
824
|
return DataType.DECIMAL
|
|
824
|
-
elif is_string_dtype(pandas_data_type):
|
|
825
|
+
elif is_string_dtype(pandas_data_type) or is_object_dtype(pandas_data_type):
|
|
825
826
|
return DataType.STRING
|
|
826
827
|
else:
|
|
827
828
|
msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
|
upgini/errors.py
CHANGED
|
@@ -16,7 +16,7 @@ class UnauthorizedError(HttpError):
|
|
|
16
16
|
"""Unauthorized error from REST API."""
|
|
17
17
|
|
|
18
18
|
def __init__(self, message, status_code):
|
|
19
|
-
message = "Unauthorized, please check your authorization token ({})"
|
|
19
|
+
message = f"Unauthorized, please check your authorization token ({message})"
|
|
20
20
|
super(UnauthorizedError, self).__init__(message, status_code)
|
|
21
21
|
|
|
22
22
|
|
upgini/features_enricher.py
CHANGED
|
@@ -21,6 +21,7 @@ from pandas.api.types import (
|
|
|
21
21
|
is_bool,
|
|
22
22
|
is_datetime64_any_dtype,
|
|
23
23
|
is_numeric_dtype,
|
|
24
|
+
is_object_dtype,
|
|
24
25
|
is_period_dtype,
|
|
25
26
|
is_string_dtype,
|
|
26
27
|
)
|
|
@@ -424,6 +425,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
424
425
|
|
|
425
426
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
426
427
|
|
|
428
|
+
# Validate client estimator params
|
|
429
|
+
self._get_client_cat_features(estimator, X, self.search_keys)
|
|
430
|
+
|
|
427
431
|
try:
|
|
428
432
|
self.X = X
|
|
429
433
|
self.y = y
|
|
@@ -817,6 +821,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
817
821
|
trace_id = trace_id or str(uuid.uuid4())
|
|
818
822
|
start_time = time.time()
|
|
819
823
|
with MDC(trace_id=trace_id):
|
|
824
|
+
self.logger.info("Start calculate metrics")
|
|
820
825
|
if len(args) > 0:
|
|
821
826
|
msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
|
|
822
827
|
self.logger.warning(msg)
|
|
@@ -868,22 +873,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
868
873
|
self.__display_support_link(msg)
|
|
869
874
|
return None
|
|
870
875
|
|
|
871
|
-
cat_features =
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
estimator is not None
|
|
875
|
-
and hasattr(estimator, "get_param")
|
|
876
|
-
and estimator.get_param("cat_features") is not None
|
|
877
|
-
):
|
|
878
|
-
cat_features = estimator.get_param("cat_features")
|
|
879
|
-
if len(cat_features) > 0 and isinstance(cat_features[0], int):
|
|
880
|
-
cat_features = [effective_X.columns[i] for i in cat_features]
|
|
881
|
-
for cat_feature in cat_features:
|
|
882
|
-
if cat_feature in self.search_keys:
|
|
883
|
-
if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
884
|
-
search_keys_for_metrics.append(cat_feature)
|
|
885
|
-
else:
|
|
886
|
-
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
876
|
+
cat_features, search_keys_for_metrics = self._get_client_cat_features(
|
|
877
|
+
estimator, effective_X, self.search_keys
|
|
878
|
+
)
|
|
887
879
|
|
|
888
880
|
prepared_data = self._prepare_data_for_metrics(
|
|
889
881
|
trace_id=trace_id,
|
|
@@ -898,6 +890,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
898
890
|
search_keys_for_metrics=search_keys_for_metrics,
|
|
899
891
|
progress_bar=progress_bar,
|
|
900
892
|
progress_callback=progress_callback,
|
|
893
|
+
cat_features=cat_features,
|
|
901
894
|
)
|
|
902
895
|
if prepared_data is None:
|
|
903
896
|
return None
|
|
@@ -1273,6 +1266,29 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1273
1266
|
|
|
1274
1267
|
return _cv, groups
|
|
1275
1268
|
|
|
1269
|
+
def _get_client_cat_features(
|
|
1270
|
+
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1271
|
+
) -> Optional[List[str]]:
|
|
1272
|
+
cat_features = None
|
|
1273
|
+
search_keys_for_metrics = []
|
|
1274
|
+
if (
|
|
1275
|
+
estimator is not None
|
|
1276
|
+
and hasattr(estimator, "get_param")
|
|
1277
|
+
and estimator.get_param("cat_features") is not None
|
|
1278
|
+
):
|
|
1279
|
+
cat_features = estimator.get_param("cat_features")
|
|
1280
|
+
if len(cat_features) > 0:
|
|
1281
|
+
if all([isinstance(f, int) for f in cat_features]):
|
|
1282
|
+
cat_features = [X.columns[i] for i in cat_features]
|
|
1283
|
+
self.logger.info(f"Collected categorical features {cat_features} from user estimator")
|
|
1284
|
+
for cat_feature in cat_features:
|
|
1285
|
+
if cat_feature in search_keys:
|
|
1286
|
+
if search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
1287
|
+
search_keys_for_metrics.append(cat_feature)
|
|
1288
|
+
else:
|
|
1289
|
+
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
1290
|
+
return cat_features, search_keys_for_metrics
|
|
1291
|
+
|
|
1276
1292
|
def _prepare_data_for_metrics(
|
|
1277
1293
|
self,
|
|
1278
1294
|
trace_id: str,
|
|
@@ -1287,6 +1303,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1287
1303
|
search_keys_for_metrics: Optional[List[str]] = None,
|
|
1288
1304
|
progress_bar: Optional[ProgressBar] = None,
|
|
1289
1305
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1306
|
+
cat_features: Optional[List[str]] = None,
|
|
1290
1307
|
):
|
|
1291
1308
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
|
1292
1309
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
|
@@ -1316,6 +1333,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1316
1333
|
excluding_search_keys = list(search_keys.keys())
|
|
1317
1334
|
if search_keys_for_metrics is not None and len(search_keys_for_metrics) > 0:
|
|
1318
1335
|
excluding_search_keys = [sk for sk in excluding_search_keys if sk not in search_keys_for_metrics]
|
|
1336
|
+
meta = self._search_task.get_all_features_metadata_v2()
|
|
1337
|
+
zero_importance_client_features = [m for m in meta if m.source == "etalon" and m.shap_value == 0.0]
|
|
1338
|
+
|
|
1319
1339
|
client_features = [
|
|
1320
1340
|
c
|
|
1321
1341
|
for c in X_sampled.columns.to_list()
|
|
@@ -1324,6 +1344,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1324
1344
|
excluding_search_keys
|
|
1325
1345
|
+ list(self.fit_dropped_features)
|
|
1326
1346
|
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID]
|
|
1347
|
+
+ zero_importance_client_features
|
|
1327
1348
|
)
|
|
1328
1349
|
]
|
|
1329
1350
|
|
|
@@ -1344,9 +1365,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1344
1365
|
|
|
1345
1366
|
# Detect and drop high cardinality columns in train
|
|
1346
1367
|
columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
]
|
|
1368
|
+
non_excluding_columns = (self.generate_features or []) + (cat_features or [])
|
|
1369
|
+
columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
|
|
1350
1370
|
if len(columns_with_high_cardinality) > 0:
|
|
1351
1371
|
self.logger.warning(
|
|
1352
1372
|
f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
|
|
@@ -2532,7 +2552,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2532
2552
|
validated_X = X.copy()
|
|
2533
2553
|
elif isinstance(X, pd.Series):
|
|
2534
2554
|
validated_X = X.to_frame()
|
|
2535
|
-
elif isinstance(X, np.ndarray)
|
|
2555
|
+
elif isinstance(X, (list, np.ndarray)):
|
|
2536
2556
|
validated_X = pd.DataFrame(X)
|
|
2537
2557
|
renaming = {c: str(c) for c in validated_X.columns}
|
|
2538
2558
|
validated_X = validated_X.rename(columns=renaming)
|
|
@@ -2621,7 +2641,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2621
2641
|
validated_eval_X = eval_X.copy()
|
|
2622
2642
|
elif isinstance(eval_X, pd.Series):
|
|
2623
2643
|
validated_eval_X = eval_X.to_frame()
|
|
2624
|
-
elif isinstance(eval_X, np.ndarray)
|
|
2644
|
+
elif isinstance(eval_X, (list, np.ndarray)):
|
|
2625
2645
|
validated_eval_X = pd.DataFrame(eval_X)
|
|
2626
2646
|
renaming = {c: str(c) for c in validated_eval_X.columns}
|
|
2627
2647
|
validated_eval_X = validated_eval_X.rename(columns=renaming)
|
|
@@ -2803,7 +2823,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2803
2823
|
)
|
|
2804
2824
|
|
|
2805
2825
|
def sample(df):
|
|
2806
|
-
if isinstance(df, pd.
|
|
2826
|
+
if isinstance(df, (pd.DataFrame, pd.Series)):
|
|
2807
2827
|
return df.head(10)
|
|
2808
2828
|
else:
|
|
2809
2829
|
return df[:10]
|
|
@@ -2829,8 +2849,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2829
2849
|
maybe_date_col = self._get_date_column(self.search_keys)
|
|
2830
2850
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
2831
2851
|
# TODO cast date column to single dtype
|
|
2832
|
-
|
|
2833
|
-
|
|
2852
|
+
date_converter = DateTimeSearchKeyConverter(maybe_date_col, self.date_format)
|
|
2853
|
+
converted_X = date_converter.convert(X)
|
|
2854
|
+
min_date = converted_X[maybe_date_col].min()
|
|
2855
|
+
max_date = converted_X[maybe_date_col].max()
|
|
2834
2856
|
self.logger.info(f"Dates interval is ({min_date}, {max_date})")
|
|
2835
2857
|
|
|
2836
2858
|
except Exception:
|
|
@@ -2967,7 +2989,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2967
2989
|
|
|
2968
2990
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
2969
2991
|
target = df[self.TARGET_NAME]
|
|
2970
|
-
if is_string_dtype(target):
|
|
2992
|
+
if is_string_dtype(target) or is_object_dtype(target):
|
|
2971
2993
|
maybe_numeric_target = pd.to_numeric(target, errors="coerce")
|
|
2972
2994
|
# If less than 5% is non numeric then leave this rows with NaN target and later it will be dropped
|
|
2973
2995
|
if maybe_numeric_target.isna().sum() <= _num_samples(df) * 0.05:
|
|
@@ -3240,6 +3262,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3240
3262
|
descriptions = []
|
|
3241
3263
|
for m in autofe_meta:
|
|
3242
3264
|
autofe_feature = Feature.from_formula(m.formula)
|
|
3265
|
+
orig_to_hashed = {base_column.original_name: base_column.hashed_name for base_column in m.base_columns}
|
|
3266
|
+
autofe_feature.rename_columns(orig_to_hashed)
|
|
3243
3267
|
autofe_feature.set_display_index(m.display_index)
|
|
3244
3268
|
if autofe_feature.op.is_vector:
|
|
3245
3269
|
continue
|
|
@@ -3367,7 +3391,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3367
3391
|
valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
|
|
3368
3392
|
else:
|
|
3369
3393
|
if x[column_name].isnull().all() or (
|
|
3370
|
-
is_string_dtype(x[column_name])
|
|
3394
|
+
(is_string_dtype(x[column_name]) or is_object_dtype(x[column_name]))
|
|
3395
|
+
and (x[column_name].astype("string").str.strip() == "").all()
|
|
3371
3396
|
):
|
|
3372
3397
|
raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
|
|
3373
3398
|
|
|
@@ -3674,7 +3699,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3674
3699
|
def sample(inp, sample_index):
|
|
3675
3700
|
if _num_samples(inp) <= 1000:
|
|
3676
3701
|
return inp
|
|
3677
|
-
if isinstance(inp, pd.DataFrame
|
|
3702
|
+
if isinstance(inp, (pd.DataFrame, pd.Series)):
|
|
3678
3703
|
return inp.sample(n=1000, random_state=random_state)
|
|
3679
3704
|
if isinstance(inp, np.ndarray):
|
|
3680
3705
|
return inp[sample_index]
|
upgini/http.py
CHANGED
|
@@ -22,6 +22,7 @@ from pydantic import BaseModel
|
|
|
22
22
|
from pythonjsonlogger import jsonlogger
|
|
23
23
|
from requests.exceptions import RequestException
|
|
24
24
|
|
|
25
|
+
from upgini.__about__ import __version__
|
|
25
26
|
from upgini.errors import (
|
|
26
27
|
HttpError,
|
|
27
28
|
UnauthorizedError,
|
|
@@ -38,17 +39,17 @@ from upgini.metadata import (
|
|
|
38
39
|
from upgini.resource_bundle import bundle
|
|
39
40
|
from upgini.utils.track_info import get_track_metrics
|
|
40
41
|
|
|
41
|
-
try:
|
|
42
|
-
|
|
42
|
+
# try:
|
|
43
|
+
# from importlib.metadata import version # type: ignore
|
|
43
44
|
|
|
44
|
-
|
|
45
|
-
except ImportError:
|
|
46
|
-
|
|
47
|
-
|
|
45
|
+
# __version__ = version("upgini")
|
|
46
|
+
# except ImportError:
|
|
47
|
+
# try:
|
|
48
|
+
# from importlib_metadata import version # type: ignore
|
|
48
49
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
50
|
+
# __version__ = version("upgini")
|
|
51
|
+
# except ImportError:
|
|
52
|
+
# __version__ = "Upgini wasn't installed"
|
|
52
53
|
|
|
53
54
|
UPGINI_URL: str = "UPGINI_URL"
|
|
54
55
|
UPGINI_API_KEY: str = "UPGINI_API_KEY"
|
|
@@ -925,7 +926,7 @@ def is_demo_api_key(api_token: Optional[str]) -> bool:
|
|
|
925
926
|
return api_token is None or api_token == "" or api_token == DEMO_API_KEY
|
|
926
927
|
|
|
927
928
|
|
|
928
|
-
@lru_cache
|
|
929
|
+
@lru_cache
|
|
929
930
|
def _get_rest_client(
|
|
930
931
|
backend_url: str, api_token: str, client_ip: Optional[str] = None, client_visitorid: Optional[str] = None
|
|
931
932
|
) -> _RestClient:
|
upgini/mdc/__init__.py
CHANGED
|
@@ -1,15 +1,13 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
1
|
"""
|
|
3
2
|
.. module: mdc
|
|
4
3
|
.. moduleauthor:: Aljosha Friemann a.friemann@automate.wtf
|
|
5
4
|
"""
|
|
6
|
-
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
7
5
|
|
|
8
6
|
import logging
|
|
9
7
|
|
|
10
|
-
from upgini.mdc.context import new_log_context, get_mdc_fields
|
|
11
8
|
from pythonjsonlogger import jsonlogger
|
|
12
9
|
|
|
10
|
+
from upgini.mdc.context import get_mdc_fields, new_log_context
|
|
13
11
|
|
|
14
12
|
MDContext = new_log_context
|
|
15
13
|
MDC = new_log_context
|
upgini/mdc/context.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
1
|
"""
|
|
3
2
|
.. module: TODO
|
|
4
3
|
:platform: TODO
|
|
@@ -7,12 +6,11 @@
|
|
|
7
6
|
.. moduleauthor:: Aljosha Friemann a.friemann@automate.wtf
|
|
8
7
|
"""
|
|
9
8
|
|
|
10
|
-
import
|
|
11
|
-
import uuid
|
|
9
|
+
import collections
|
|
12
10
|
import logging
|
|
13
11
|
import threading
|
|
14
|
-
import
|
|
15
|
-
|
|
12
|
+
import time
|
|
13
|
+
import uuid
|
|
16
14
|
from contextlib import contextmanager
|
|
17
15
|
|
|
18
16
|
LOGGER = logging.getLogger(__name__)
|
|
@@ -32,7 +30,7 @@ def get_mdc_fields():
|
|
|
32
30
|
|
|
33
31
|
@contextmanager
|
|
34
32
|
def new_log_context(**kwargs):
|
|
35
|
-
context_id = "mdc-{
|
|
33
|
+
context_id = f"mdc-{threading.current_thread().ident}-{uuid.uuid4()}"
|
|
36
34
|
|
|
37
35
|
LOGGER.debug("creating context %s", context_id)
|
|
38
36
|
|
upgini/metadata.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from enum import Enum
|
|
2
4
|
from typing import Dict, List, Optional, Set
|
|
3
5
|
|
|
@@ -201,6 +203,7 @@ class FileMetadata(BaseModel):
|
|
|
201
203
|
for c in self.columns:
|
|
202
204
|
if c.name == name:
|
|
203
205
|
return c
|
|
206
|
+
return None
|
|
204
207
|
|
|
205
208
|
def search_types(self) -> Set[SearchKey]:
|
|
206
209
|
search_keys = set()
|