upgini 1.2.29a6__py3-none-any.whl → 1.2.29a7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +21 -12
- upgini/http.py +1 -1
- upgini/utils/datetime_utils.py +6 -4
- upgini/utils/email_utils.py +3 -2
- upgini/utils/features_validator.py +13 -1
- {upgini-1.2.29a6.dist-info → upgini-1.2.29a7.dist-info}/METADATA +1 -1
- {upgini-1.2.29a6.dist-info → upgini-1.2.29a7.dist-info}/RECORD +10 -10
- {upgini-1.2.29a6.dist-info → upgini-1.2.29a7.dist-info}/WHEEL +1 -1
- {upgini-1.2.29a6.dist-info → upgini-1.2.29a7.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.29a6"
|
|
1
|
+
__version__ = "1.2.29a7"
|
upgini/features_enricher.py
CHANGED
|
@@ -54,7 +54,6 @@ from upgini.metadata import (
|
|
|
54
54
|
SYSTEM_RECORD_ID,
|
|
55
55
|
TARGET,
|
|
56
56
|
CVType,
|
|
57
|
-
FeaturesMetadataV2,
|
|
58
57
|
FileColumnMeaningType,
|
|
59
58
|
ModelTaskType,
|
|
60
59
|
RuntimeParameters,
|
|
@@ -1448,7 +1447,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1448
1447
|
client_features = [
|
|
1449
1448
|
c
|
|
1450
1449
|
for c in X_sampled.columns.to_list()
|
|
1451
|
-
if (
|
|
1450
|
+
if (
|
|
1451
|
+
not self.select_features
|
|
1452
|
+
or c in self.feature_names_
|
|
1453
|
+
or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
|
|
1454
|
+
)
|
|
1452
1455
|
and c
|
|
1453
1456
|
not in (
|
|
1454
1457
|
excluding_search_keys
|
|
@@ -1665,7 +1668,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1665
1668
|
generated_features = []
|
|
1666
1669
|
if date_column is not None:
|
|
1667
1670
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1668
|
-
|
|
1671
|
+
# Leave original date column values
|
|
1672
|
+
df_with_date_features = converter.convert(df, keep_time=True)
|
|
1673
|
+
df_with_date_features[date_column] = df[date_column]
|
|
1674
|
+
df = df_with_date_features
|
|
1669
1675
|
generated_features = converter.generated_features
|
|
1670
1676
|
|
|
1671
1677
|
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
@@ -1674,9 +1680,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1674
1680
|
df = generator.generate(df)
|
|
1675
1681
|
generated_features.extend(generator.generated_features)
|
|
1676
1682
|
|
|
1677
|
-
normalizer = Normalizer(self.bundle, self.logger)
|
|
1678
|
-
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1679
|
-
columns_renaming = normalizer.columns_renaming
|
|
1683
|
+
# normalizer = Normalizer(self.bundle, self.logger)
|
|
1684
|
+
# df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1685
|
+
# columns_renaming = normalizer.columns_renaming
|
|
1686
|
+
columns_renaming = {c: c for c in df.columns}
|
|
1680
1687
|
|
|
1681
1688
|
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1682
1689
|
|
|
@@ -2106,7 +2113,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2106
2113
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2107
2114
|
if date_column is not None:
|
|
2108
2115
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2109
|
-
df = converter.convert(df)
|
|
2116
|
+
df = converter.convert(df, keep_time=True)
|
|
2110
2117
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2111
2118
|
generated_features.extend(converter.generated_features)
|
|
2112
2119
|
else:
|
|
@@ -2201,11 +2208,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2201
2208
|
|
|
2202
2209
|
if add_fit_system_record_id:
|
|
2203
2210
|
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2204
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2205
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2206
2211
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2207
2212
|
features_not_to_pass.append(SORT_ID)
|
|
2208
2213
|
|
|
2214
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2215
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2216
|
+
|
|
2209
2217
|
# search keys might be changed after explode
|
|
2210
2218
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2211
2219
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
@@ -2224,7 +2232,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2224
2232
|
|
|
2225
2233
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
2226
2234
|
|
|
2227
|
-
df_without_features = df.drop(columns=features_not_to_pass)
|
|
2235
|
+
df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
|
|
2228
2236
|
|
|
2229
2237
|
df_without_features, full_duplicates_warning = clean_full_duplicates(
|
|
2230
2238
|
df_without_features, self.logger, bundle=self.bundle
|
|
@@ -2339,7 +2347,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2339
2347
|
if c not in self.dropped_client_feature_names_
|
|
2340
2348
|
]
|
|
2341
2349
|
filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
|
|
2342
|
-
selecting_columns.extend(
|
|
2350
|
+
selecting_columns.extend(
|
|
2351
|
+
c for c in filtered_columns if c in result.columns and c not in validated_X.columns
|
|
2352
|
+
)
|
|
2343
2353
|
if add_fit_system_record_id:
|
|
2344
2354
|
selecting_columns.append(SORT_ID)
|
|
2345
2355
|
|
|
@@ -3544,7 +3554,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3544
3554
|
):
|
|
3545
3555
|
continue
|
|
3546
3556
|
|
|
3547
|
-
|
|
3548
3557
|
self.feature_names_.append(feature_meta.name)
|
|
3549
3558
|
self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
|
|
3550
3559
|
|
upgini/http.py
CHANGED
|
@@ -882,7 +882,7 @@ class _RestClient:
|
|
|
882
882
|
if content_type:
|
|
883
883
|
headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
|
|
884
884
|
if trace_id:
|
|
885
|
-
headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
|
|
885
|
+
headers[_RestClient.TRACE_ID_HEADER_NAME] = str(trace_id)
|
|
886
886
|
for header_key, header_value in additional_headers.items():
|
|
887
887
|
headers[header_key] = header_value
|
|
888
888
|
return headers
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -114,10 +114,12 @@ class DateTimeSearchKeyConverter:
|
|
|
114
114
|
period_suffix = f"_{period}" if column != "day_in_quarter" else ""
|
|
115
115
|
sin_feature = f"datetime_{column}_sin{period_suffix}"
|
|
116
116
|
cos_feature = f"datetime_{column}_cos{period_suffix}"
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
117
|
+
if sin_feature not in df.columns:
|
|
118
|
+
df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
|
|
119
|
+
self.generated_features.append(sin_feature)
|
|
120
|
+
if cos_feature not in df.columns:
|
|
121
|
+
df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
|
|
122
|
+
self.generated_features.append(cos_feature)
|
|
121
123
|
|
|
122
124
|
df["quarter"] = df[self.date_column].dt.quarter
|
|
123
125
|
|
upgini/utils/email_utils.py
CHANGED
|
@@ -38,8 +38,9 @@ class EmailDomainGenerator:
|
|
|
38
38
|
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
39
39
|
for email_col in self.email_columns:
|
|
40
40
|
domain_feature = email_col + self.DOMAIN_SUFFIX
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
if domain_feature not in df.columns:
|
|
42
|
+
df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
|
|
43
|
+
self.generated_features.append(domain_feature)
|
|
43
44
|
return df
|
|
44
45
|
|
|
45
46
|
@staticmethod
|
|
upgini/utils/features_validator.py
CHANGED
|
@@ -2,6 +2,7 @@ import logging
|
|
|
2
2
|
from logging import Logger
|
|
3
3
|
from typing import Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
|
+
import numpy as np
|
|
5
6
|
import pandas as pd
|
|
6
7
|
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
|
|
7
8
|
|
|
@@ -83,10 +84,21 @@ class FeaturesValidator:
|
|
|
83
84
|
return [
|
|
84
85
|
i
|
|
85
86
|
for i in df
|
|
86
|
-
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or
|
|
87
|
+
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
|
|
87
88
|
and (df[i].nunique(dropna=False) / row_count >= 0.85)
|
|
88
89
|
]
|
|
89
90
|
|
|
91
|
+
@staticmethod
|
|
92
|
+
def __is_integer(series: pd.Series) -> bool:
|
|
93
|
+
return (
|
|
94
|
+
is_integer_dtype(series)
|
|
95
|
+
or series.dropna()
|
|
96
|
+
.apply(
|
|
97
|
+
lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
|
|
98
|
+
)
|
|
99
|
+
.all()
|
|
100
|
+
)
|
|
101
|
+
|
|
90
102
|
@staticmethod
|
|
91
103
|
def find_constant_features(df: pd.DataFrame) -> List[str]:
|
|
92
104
|
return [i for i in df if df[i].nunique() <= 1]
|
|
{upgini-1.2.29a6.dist-info → upgini-1.2.29a7.dist-info}/RECORD
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=zQC-_yiNcwPq8o3NPpgr0tGKyMXiUXgF1aIDtN0fDEk,25
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
7
|
-
upgini/http.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=tBdArcifhTLuDIL4D_eRB1gIBt9ayTVU7Ox2fVKE68c,191300
|
|
7
|
+
upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
10
|
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
@@ -43,13 +43,13 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
|
|
|
43
43
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
44
44
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
45
45
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
46
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
46
|
+
upgini/utils/datetime_utils.py,sha256=F61i2vZCB6eUy4WwodDyPi50XKPbhOHsxDrU6tGa6CM,13133
|
|
47
47
|
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
48
48
|
upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
|
|
49
|
-
upgini/utils/email_utils.py,sha256=
|
|
49
|
+
upgini/utils/email_utils.py,sha256=GbnhHJn1nhUBytmK6PophYqaoq4t7Lp6i0-O0Gd3RV8,5265
|
|
50
50
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
51
51
|
upgini/utils/feature_info.py,sha256=Tp_2g5-rCjY4NpzKhzxwNxuqH5FFL8vG94OU5kH6wzk,6702
|
|
52
|
-
upgini/utils/features_validator.py,sha256=
|
|
52
|
+
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
53
53
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
54
54
|
upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
|
|
55
55
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=PU77nIhTz7IHbC4rpTpxrVxib6cdpRL9F1dhkjIffLY,10225
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.29a7.dist-info/METADATA,sha256=sE4t490pcKTOegDZx5S7gX4eh9j_pk8zHl5xKe8Qy08,48580
|
|
63
|
+
upgini-1.2.29a7.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
64
|
+
upgini-1.2.29a7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.29a7.dist-info/RECORD,,
|
|
File without changes
|