upgini 1.2.29a5__py3-none-any.whl → 1.2.29a7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +45 -24
- upgini/http.py +1 -1
- upgini/utils/datetime_utils.py +6 -4
- upgini/utils/email_utils.py +3 -2
- upgini/utils/features_validator.py +13 -1
- {upgini-1.2.29a5.dist-info → upgini-1.2.29a7.dist-info}/METADATA +1 -1
- {upgini-1.2.29a5.dist-info → upgini-1.2.29a7.dist-info}/RECORD +10 -10
- {upgini-1.2.29a5.dist-info → upgini-1.2.29a7.dist-info}/WHEEL +1 -1
- {upgini-1.2.29a5.dist-info → upgini-1.2.29a7.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.29a5"
|
|
1
|
+
__version__ = "1.2.29a7"
|
upgini/features_enricher.py
CHANGED
|
@@ -2,6 +2,7 @@ import dataclasses
|
|
|
2
2
|
import datetime
|
|
3
3
|
import gc
|
|
4
4
|
import hashlib
|
|
5
|
+
import itertools
|
|
5
6
|
import logging
|
|
6
7
|
import numbers
|
|
7
8
|
import os
|
|
@@ -53,7 +54,6 @@ from upgini.metadata import (
|
|
|
53
54
|
SYSTEM_RECORD_ID,
|
|
54
55
|
TARGET,
|
|
55
56
|
CVType,
|
|
56
|
-
FeaturesMetadataV2,
|
|
57
57
|
FileColumnMeaningType,
|
|
58
58
|
ModelTaskType,
|
|
59
59
|
RuntimeParameters,
|
|
@@ -159,6 +159,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
159
159
|
|
|
160
160
|
shared_datasets: list of str, optional (default=None)
|
|
161
161
|
List of private shared dataset ids for custom search
|
|
162
|
+
|
|
163
|
+
select_features: bool, optional (default=False)
|
|
164
|
+
If True, return only selected features both from input and data sources.
|
|
165
|
+
Otherwise, return all features from input and only selected features from data sources.
|
|
162
166
|
"""
|
|
163
167
|
|
|
164
168
|
TARGET_NAME = "target"
|
|
@@ -279,7 +283,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
279
283
|
self._relevant_data_sources_wo_links: pd.DataFrame = self.EMPTY_DATA_SOURCES
|
|
280
284
|
self.metrics: Optional[pd.DataFrame] = None
|
|
281
285
|
self.feature_names_ = []
|
|
282
|
-
self.
|
|
286
|
+
self.dropped_client_feature_names_ = []
|
|
283
287
|
self.feature_importances_ = []
|
|
284
288
|
self.search_id = search_id
|
|
285
289
|
self.select_features = select_features
|
|
@@ -1443,7 +1447,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1443
1447
|
client_features = [
|
|
1444
1448
|
c
|
|
1445
1449
|
for c in X_sampled.columns.to_list()
|
|
1446
|
-
if (
|
|
1450
|
+
if (
|
|
1451
|
+
not self.select_features
|
|
1452
|
+
or c in self.feature_names_
|
|
1453
|
+
or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
|
|
1454
|
+
)
|
|
1447
1455
|
and c
|
|
1448
1456
|
not in (
|
|
1449
1457
|
excluding_search_keys
|
|
@@ -1660,7 +1668,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1660
1668
|
generated_features = []
|
|
1661
1669
|
if date_column is not None:
|
|
1662
1670
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, self.bundle)
|
|
1663
|
-
|
|
1671
|
+
# Leave original date column values
|
|
1672
|
+
df_with_date_features = converter.convert(df, keep_time=True)
|
|
1673
|
+
df_with_date_features[date_column] = df[date_column]
|
|
1674
|
+
df = df_with_date_features
|
|
1664
1675
|
generated_features = converter.generated_features
|
|
1665
1676
|
|
|
1666
1677
|
email_columns = SearchKey.find_all_keys(search_keys, SearchKey.EMAIL)
|
|
@@ -1669,9 +1680,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1669
1680
|
df = generator.generate(df)
|
|
1670
1681
|
generated_features.extend(generator.generated_features)
|
|
1671
1682
|
|
|
1672
|
-
normalizer = Normalizer(self.bundle, self.logger)
|
|
1673
|
-
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1674
|
-
columns_renaming = normalizer.columns_renaming
|
|
1683
|
+
# normalizer = Normalizer(self.bundle, self.logger)
|
|
1684
|
+
# df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1685
|
+
# columns_renaming = normalizer.columns_renaming
|
|
1686
|
+
columns_renaming = {c: c for c in df.columns}
|
|
1675
1687
|
|
|
1676
1688
|
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1677
1689
|
|
|
@@ -2071,7 +2083,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2071
2083
|
is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
|
|
2072
2084
|
|
|
2073
2085
|
columns_to_drop = [
|
|
2074
|
-
c for c in validated_X.columns if c in self.feature_names_ and c
|
|
2086
|
+
c for c in validated_X.columns if c in self.feature_names_ and c in self.dropped_client_feature_names_
|
|
2075
2087
|
]
|
|
2076
2088
|
if len(columns_to_drop) > 0:
|
|
2077
2089
|
msg = self.bundle.get("x_contains_enriching_columns").format(columns_to_drop)
|
|
@@ -2101,7 +2113,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2101
2113
|
date_column = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2102
2114
|
if date_column is not None:
|
|
2103
2115
|
converter = DateTimeSearchKeyConverter(date_column, self.date_format, self.logger, bundle=self.bundle)
|
|
2104
|
-
df = converter.convert(df)
|
|
2116
|
+
df = converter.convert(df, keep_time=True)
|
|
2105
2117
|
self.logger.info(f"Date column after convertion: {df[date_column]}")
|
|
2106
2118
|
generated_features.extend(converter.generated_features)
|
|
2107
2119
|
else:
|
|
@@ -2196,11 +2208,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2196
2208
|
|
|
2197
2209
|
if add_fit_system_record_id:
|
|
2198
2210
|
df = self.__add_fit_system_record_id(df, search_keys, SYSTEM_RECORD_ID)
|
|
2199
|
-
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2200
|
-
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2201
2211
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
2202
2212
|
features_not_to_pass.append(SORT_ID)
|
|
2203
2213
|
|
|
2214
|
+
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2215
|
+
df = df.drop(columns=DateTimeSearchKeyConverter.DATETIME_COL)
|
|
2216
|
+
|
|
2204
2217
|
# search keys might be changed after explode
|
|
2205
2218
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + features_for_transform)
|
|
2206
2219
|
df[SYSTEM_RECORD_ID] = pd.util.hash_pandas_object(df[columns_for_system_record_id], index=False).astype(
|
|
@@ -2219,7 +2232,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2219
2232
|
|
|
2220
2233
|
combined_search_keys = combine_search_keys(search_keys.keys())
|
|
2221
2234
|
|
|
2222
|
-
df_without_features = df.drop(columns=features_not_to_pass)
|
|
2235
|
+
df_without_features = df.drop(columns=features_not_to_pass, errors="ignore")
|
|
2223
2236
|
|
|
2224
2237
|
df_without_features, full_duplicates_warning = clean_full_duplicates(
|
|
2225
2238
|
df_without_features, self.logger, bundle=self.bundle
|
|
@@ -2328,11 +2341,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2328
2341
|
else:
|
|
2329
2342
|
result = enrich()
|
|
2330
2343
|
|
|
2344
|
+
selecting_columns = [
|
|
2345
|
+
c
|
|
2346
|
+
for c in itertools.chain(validated_X.columns.tolist(), generated_features)
|
|
2347
|
+
if c not in self.dropped_client_feature_names_
|
|
2348
|
+
]
|
|
2331
2349
|
filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
|
|
2332
|
-
|
|
2350
|
+
selecting_columns.extend(
|
|
2333
2351
|
c for c in filtered_columns if c in result.columns and c not in validated_X.columns
|
|
2334
|
-
|
|
2335
|
-
selecting_columns = validated_X.columns.tolist() + generated_features + existing_filtered_columns
|
|
2352
|
+
)
|
|
2336
2353
|
if add_fit_system_record_id:
|
|
2337
2354
|
selecting_columns.append(SORT_ID)
|
|
2338
2355
|
|
|
@@ -3510,7 +3527,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3510
3527
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
|
3511
3528
|
|
|
3512
3529
|
self.feature_names_ = []
|
|
3513
|
-
self.
|
|
3530
|
+
self.dropped_client_feature_names_ = []
|
|
3514
3531
|
self.feature_importances_ = []
|
|
3515
3532
|
features_info = []
|
|
3516
3533
|
features_info_without_links = []
|
|
@@ -3520,19 +3537,23 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3520
3537
|
for feature_meta in features_meta:
|
|
3521
3538
|
if feature_meta.name in original_names_dict.keys():
|
|
3522
3539
|
feature_meta.name = original_names_dict[feature_meta.name]
|
|
3540
|
+
|
|
3541
|
+
is_client_feature = feature_meta.name in x_columns
|
|
3542
|
+
|
|
3543
|
+
if feature_meta.shap_value == 0.0:
|
|
3544
|
+
if self.select_features:
|
|
3545
|
+
self.dropped_client_feature_names_.append(feature_meta.name)
|
|
3546
|
+
continue
|
|
3547
|
+
|
|
3523
3548
|
# Use only important features
|
|
3524
3549
|
if (
|
|
3525
|
-
|
|
3526
|
-
or
|
|
3527
|
-
|
|
3550
|
+
feature_meta.name in self.fit_generated_features
|
|
3551
|
+
or feature_meta.name == COUNTRY
|
|
3552
|
+
# In select_features mode we select also from etalon features and need to show them
|
|
3553
|
+
or (not self.select_features and is_client_feature)
|
|
3528
3554
|
):
|
|
3529
3555
|
continue
|
|
3530
3556
|
|
|
3531
|
-
is_client_feature = feature_meta.name in x_columns
|
|
3532
|
-
# In select_features mode we select also from etalon features and need to show them
|
|
3533
|
-
if not self.select_features and is_client_feature:
|
|
3534
|
-
continue
|
|
3535
|
-
|
|
3536
3557
|
self.feature_names_.append(feature_meta.name)
|
|
3537
3558
|
self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
|
|
3538
3559
|
|
upgini/http.py
CHANGED
|
@@ -882,7 +882,7 @@ class _RestClient:
|
|
|
882
882
|
if content_type:
|
|
883
883
|
headers[_RestClient.CONTENT_TYPE_HEADER_NAME] = content_type
|
|
884
884
|
if trace_id:
|
|
885
|
-
headers[_RestClient.TRACE_ID_HEADER_NAME] = trace_id
|
|
885
|
+
headers[_RestClient.TRACE_ID_HEADER_NAME] = str(trace_id)
|
|
886
886
|
for header_key, header_value in additional_headers.items():
|
|
887
887
|
headers[header_key] = header_value
|
|
888
888
|
return headers
|
upgini/utils/datetime_utils.py
CHANGED
|
@@ -114,10 +114,12 @@ class DateTimeSearchKeyConverter:
|
|
|
114
114
|
period_suffix = f"_{period}" if column != "day_in_quarter" else ""
|
|
115
115
|
sin_feature = f"datetime_{column}_sin{period_suffix}"
|
|
116
116
|
cos_feature = f"datetime_{column}_cos{period_suffix}"
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
117
|
+
if sin_feature not in df.columns:
|
|
118
|
+
df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
|
|
119
|
+
self.generated_features.append(sin_feature)
|
|
120
|
+
if cos_feature not in df.columns:
|
|
121
|
+
df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
|
|
122
|
+
self.generated_features.append(cos_feature)
|
|
121
123
|
|
|
122
124
|
df["quarter"] = df[self.date_column].dt.quarter
|
|
123
125
|
|
upgini/utils/email_utils.py
CHANGED
|
@@ -38,8 +38,9 @@ class EmailDomainGenerator:
|
|
|
38
38
|
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
39
39
|
for email_col in self.email_columns:
|
|
40
40
|
domain_feature = email_col + self.DOMAIN_SUFFIX
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
if domain_feature not in df.columns:
|
|
42
|
+
df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
|
|
43
|
+
self.generated_features.append(domain_feature)
|
|
43
44
|
return df
|
|
44
45
|
|
|
45
46
|
@staticmethod
|
upgini/utils/features_validator.py
CHANGED
|
@@ -2,6 +2,7 @@ import logging
|
|
|
2
2
|
from logging import Logger
|
|
3
3
|
from typing import Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
|
+
import numpy as np
|
|
5
6
|
import pandas as pd
|
|
6
7
|
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
|
|
7
8
|
|
|
@@ -83,10 +84,21 @@ class FeaturesValidator:
|
|
|
83
84
|
return [
|
|
84
85
|
i
|
|
85
86
|
for i in df
|
|
86
|
-
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or
|
|
87
|
+
if (is_object_dtype(df[i]) or is_string_dtype(df[i]) or FeaturesValidator.__is_integer(df[i]))
|
|
87
88
|
and (df[i].nunique(dropna=False) / row_count >= 0.85)
|
|
88
89
|
]
|
|
89
90
|
|
|
91
|
+
@staticmethod
|
|
92
|
+
def __is_integer(series: pd.Series) -> bool:
|
|
93
|
+
return (
|
|
94
|
+
is_integer_dtype(series)
|
|
95
|
+
or series.dropna()
|
|
96
|
+
.apply(
|
|
97
|
+
lambda f: (float.is_integer(f) and abs(f) < np.iinfo(np.int64).max) if isinstance(f, float) else False
|
|
98
|
+
)
|
|
99
|
+
.all()
|
|
100
|
+
)
|
|
101
|
+
|
|
90
102
|
@staticmethod
|
|
91
103
|
def find_constant_features(df: pd.DataFrame) -> List[str]:
|
|
92
104
|
return [i for i in df if df[i].nunique() <= 1]
|
{upgini-1.2.29a5.dist-info → upgini-1.2.29a7.dist-info}/RECORD
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=zQC-_yiNcwPq8o3NPpgr0tGKyMXiUXgF1aIDtN0fDEk,25
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=iPFiMJtk4HF1ytw9wCQr8H9RfoOKj_TIo8XYZKWgcMc,31331
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
7
|
-
upgini/http.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=tBdArcifhTLuDIL4D_eRB1gIBt9ayTVU7Ox2fVKE68c,191300
|
|
7
|
+
upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
10
|
upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
|
|
@@ -43,13 +43,13 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
|
|
|
43
43
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
|
44
44
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
45
45
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
46
|
-
upgini/utils/datetime_utils.py,sha256=
|
|
46
|
+
upgini/utils/datetime_utils.py,sha256=F61i2vZCB6eUy4WwodDyPi50XKPbhOHsxDrU6tGa6CM,13133
|
|
47
47
|
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
48
48
|
upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
|
|
49
|
-
upgini/utils/email_utils.py,sha256=
|
|
49
|
+
upgini/utils/email_utils.py,sha256=GbnhHJn1nhUBytmK6PophYqaoq4t7Lp6i0-O0Gd3RV8,5265
|
|
50
50
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
51
51
|
upgini/utils/feature_info.py,sha256=Tp_2g5-rCjY4NpzKhzxwNxuqH5FFL8vG94OU5kH6wzk,6702
|
|
52
|
-
upgini/utils/features_validator.py,sha256=
|
|
52
|
+
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
53
53
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
54
54
|
upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
|
|
55
55
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=PU77nIhTz7IHbC4rpTpxrVxib6cdpRL9F1dhkjIffLY,10225
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
62
|
+
upgini-1.2.29a7.dist-info/METADATA,sha256=sE4t490pcKTOegDZx5S7gX4eh9j_pk8zHl5xKe8Qy08,48580
|
|
63
|
+
upgini-1.2.29a7.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
64
|
+
upgini-1.2.29a7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.29a7.dist-info/RECORD,,
|
|
File without changes
|