upgini 1.2.86.dev1__py3-none-any.whl → 1.2.87a3857.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +43 -5
- upgini/metrics.py +12 -13
- upgini/utils/datetime_utils.py +17 -17
- upgini/utils/email_utils.py +5 -5
- {upgini-1.2.86.dev1.dist-info → upgini-1.2.87a3857.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.86.dev1.dist-info → upgini-1.2.87a3857.dev1.dist-info}/RECORD +9 -9
- {upgini-1.2.86.dev1.dist-info → upgini-1.2.87a3857.dev1.dist-info}/WHEEL +1 -1
- {upgini-1.2.86.dev1.dist-info → upgini-1.2.87a3857.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.86.dev1"
|
1
|
+
__version__ = "1.2.87a3857.dev1"
|
upgini/features_enricher.py
CHANGED
@@ -1664,6 +1664,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
1664
1664
|
enriched_eval_y_sorted,
|
1665
1665
|
)
|
1666
1666
|
|
1667
|
+
fitting_X, fitting_enriched_X, fitting_eval_set_dict = self._convert_id_columns_to_int(
|
1668
|
+
fitting_X, fitting_enriched_X, fitting_eval_set_dict, columns_renaming
|
1669
|
+
)
|
1670
|
+
|
1667
1671
|
return (
|
1668
1672
|
validated_X,
|
1669
1673
|
fitting_X,
|
@@ -1677,6 +1681,38 @@ class FeaturesEnricher(TransformerMixin):
|
|
1677
1681
|
columns_renaming,
|
1678
1682
|
)
|
1679
1683
|
|
1684
|
+
def _convert_id_columns_to_int(
|
1685
|
+
self,
|
1686
|
+
fitting_X: pd.DataFrame,
|
1687
|
+
fitting_enriched_X: pd.DataFrame,
|
1688
|
+
fitting_eval_set_dict: Dict[int, Tuple[pd.DataFrame, pd.Series]],
|
1689
|
+
columns_renaming: Dict[str, str] = {},
|
1690
|
+
) -> pd.DataFrame:
|
1691
|
+
def _set_encoded(col_name: str, df: pd.DataFrame, slice: Tuple[int, int], combined_col: pd.Series):
|
1692
|
+
df[col_name] = combined_col.iloc[slice[0] : slice[1]]
|
1693
|
+
return slice[1]
|
1694
|
+
|
1695
|
+
inverse_columns_renaming = {v: k for k, v in columns_renaming.items()}
|
1696
|
+
|
1697
|
+
if self.id_columns:
|
1698
|
+
self.logger.info(f"Convert id columns to int: {self.id_columns}")
|
1699
|
+
for col in self.id_columns:
|
1700
|
+
col = inverse_columns_renaming.get(col, col)
|
1701
|
+
combined_col = pd.concat(
|
1702
|
+
[fitting_X[col], fitting_enriched_X[col]]
|
1703
|
+
+ [eval_set_pair[0][col] for eval_set_pair in fitting_eval_set_dict.values()]
|
1704
|
+
)
|
1705
|
+
combined_col = combined_col.astype("category").cat.codes
|
1706
|
+
slice_end = _set_encoded(col, fitting_X, (0, len(fitting_X)), combined_col)
|
1707
|
+
slice_end = _set_encoded(
|
1708
|
+
col, fitting_enriched_X, (slice_end, slice_end + len(fitting_enriched_X)), combined_col
|
1709
|
+
)
|
1710
|
+
for eval_set_pair in fitting_eval_set_dict.values():
|
1711
|
+
slice_end = _set_encoded(
|
1712
|
+
col, eval_set_pair[0], (slice_end, slice_end + len(eval_set_pair[0])), combined_col
|
1713
|
+
)
|
1714
|
+
return fitting_X, fitting_enriched_X, fitting_eval_set_dict
|
1715
|
+
|
1680
1716
|
@dataclass
|
1681
1717
|
class _SampledDataForMetrics:
|
1682
1718
|
X_sampled: pd.DataFrame
|
@@ -2204,10 +2240,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
2204
2240
|
{"name": name, "value": key_example(sk_type)} for name in sk_meta.unnestKeyNames
|
2205
2241
|
]
|
2206
2242
|
else:
|
2207
|
-
search_keys_with_values[sk_type.name] = [
|
2208
|
-
|
2209
|
-
|
2210
|
-
|
2243
|
+
search_keys_with_values[sk_type.name] = [
|
2244
|
+
{
|
2245
|
+
"name": sk_meta.originalName,
|
2246
|
+
"value": key_example(sk_type),
|
2247
|
+
}
|
2248
|
+
]
|
2211
2249
|
|
2212
2250
|
keys_section = json.dumps(search_keys_with_values)
|
2213
2251
|
features_for_transform = self._search_task.get_features_for_transform()
|
@@ -3927,7 +3965,7 @@ if response.status_code == 200:
|
|
3927
3965
|
if features_meta is None:
|
3928
3966
|
raise Exception(self.bundle.get("missing_features_meta"))
|
3929
3967
|
|
3930
|
-
return [f.name for f in features_meta if f.type == "categorical"]
|
3968
|
+
return [f.name for f in features_meta if f.type == "categorical" and f.name not in self.id_columns]
|
3931
3969
|
|
3932
3970
|
def __prepare_feature_importances(
|
3933
3971
|
self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
|
upgini/metrics.py
CHANGED
@@ -314,7 +314,7 @@ class EstimatorWrapper:
|
|
314
314
|
self.groups = groups
|
315
315
|
self.text_features = text_features
|
316
316
|
self.logger = logger or logging.getLogger()
|
317
|
-
self.
|
317
|
+
self.dropped_features = []
|
318
318
|
self.converted_to_int = []
|
319
319
|
self.converted_to_str = []
|
320
320
|
self.converted_to_numeric = []
|
@@ -363,10 +363,11 @@ class EstimatorWrapper:
|
|
363
363
|
x, y, groups = self._prepare_data(x, y, groups=self.groups)
|
364
364
|
|
365
365
|
self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
|
366
|
-
self.
|
366
|
+
self.dropped_features = []
|
367
367
|
self.converted_to_int = []
|
368
368
|
self.converted_to_str = []
|
369
369
|
self.converted_to_numeric = []
|
370
|
+
|
370
371
|
for c in x.columns:
|
371
372
|
|
372
373
|
if _get_unique_count(x[c]) < 2:
|
@@ -374,7 +375,7 @@ class EstimatorWrapper:
|
|
374
375
|
if c in self.cat_features:
|
375
376
|
self.cat_features.remove(c)
|
376
377
|
x.drop(columns=[c], inplace=True)
|
377
|
-
self.
|
378
|
+
self.dropped_features.append(c)
|
378
379
|
elif self.text_features is not None and c in self.text_features:
|
379
380
|
x[c] = x[c].astype(str)
|
380
381
|
self.converted_to_str.append(c)
|
@@ -391,9 +392,7 @@ class EstimatorWrapper:
|
|
391
392
|
self.converted_to_int.append(c)
|
392
393
|
self.cat_features.remove(c)
|
393
394
|
elif is_float_dtype(x[c]) or (x[c].dtype == "category" and is_float_dtype(x[c].cat.categories)):
|
394
|
-
self.logger.info(
|
395
|
-
f"Convert float cat feature {c} to string"
|
396
|
-
)
|
395
|
+
self.logger.info(f"Convert float cat feature {c} to string")
|
397
396
|
x[c] = x[c].astype(str)
|
398
397
|
self.converted_to_str.append(c)
|
399
398
|
elif x[c].dtype not in ["category", "int64"]:
|
@@ -411,16 +410,16 @@ class EstimatorWrapper:
|
|
411
410
|
except (ValueError, TypeError):
|
412
411
|
self.logger.warning(f"Remove feature {c} because it is not numeric and not in cat_features")
|
413
412
|
x.drop(columns=[c], inplace=True)
|
414
|
-
self.
|
413
|
+
self.dropped_features.append(c)
|
415
414
|
|
416
415
|
return x, y, groups, {}
|
417
416
|
|
418
417
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
419
418
|
x, y, _ = self._prepare_data(x, y)
|
420
419
|
|
421
|
-
if self.
|
422
|
-
self.logger.info(f"Drop features on calculate metrics: {self.
|
423
|
-
x = x.drop(columns=self.
|
420
|
+
if self.dropped_features:
|
421
|
+
self.logger.info(f"Drop features on calculate metrics: {self.dropped_features}")
|
422
|
+
x = x.drop(columns=self.dropped_features)
|
424
423
|
|
425
424
|
if self.converted_to_int:
|
426
425
|
self.logger.info(f"Convert to int features on calculate metrics: {self.converted_to_int}")
|
@@ -763,7 +762,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
763
762
|
)
|
764
763
|
for f in high_cardinality_features:
|
765
764
|
self.text_features.remove(f)
|
766
|
-
self.
|
765
|
+
self.dropped_features.append(f)
|
767
766
|
x = x.drop(columns=f, errors="ignore")
|
768
767
|
return super().cross_val_predict(x, y, baseline_score_column)
|
769
768
|
else:
|
@@ -853,7 +852,7 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
853
852
|
for c in x.columns:
|
854
853
|
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
855
854
|
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
856
|
-
self.
|
855
|
+
self.dropped_features.append(c)
|
857
856
|
x = x.drop(columns=c, errors="ignore")
|
858
857
|
return x, y_numpy, groups, params
|
859
858
|
|
@@ -933,7 +932,7 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
933
932
|
for c in x.columns:
|
934
933
|
if x[c].dtype not in ["category", "int64", "float64", "bool"]:
|
935
934
|
self.logger.warning(f"Feature {c} is not numeric and will be dropped")
|
936
|
-
self.
|
935
|
+
self.dropped_features.append(c)
|
937
936
|
x = x.drop(columns=c, errors="ignore")
|
938
937
|
return x, y_numpy, groups, params
|
939
938
|
|
upgini/utils/datetime_utils.py
CHANGED
@@ -121,31 +121,31 @@ class DateTimeSearchKeyConverter:
|
|
121
121
|
df[cos_feature] = np.cos(2 * np.pi * df[column] / period)
|
122
122
|
self.generated_features.append(cos_feature)
|
123
123
|
|
124
|
-
|
124
|
+
df["quarter"] = df[self.date_column].dt.quarter
|
125
125
|
|
126
|
-
#
|
127
|
-
|
126
|
+
# Calculate the start date of the quarter for each timestamp
|
127
|
+
df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
|
128
128
|
|
129
|
-
#
|
130
|
-
|
129
|
+
# Calculate the day in the quarter
|
130
|
+
df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
|
131
131
|
|
132
|
-
#
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
132
|
+
# Vectorized calculation of days_in_quarter
|
133
|
+
quarter = df["quarter"]
|
134
|
+
start = df["quarter_start"]
|
135
|
+
year = start.dt.year
|
136
|
+
month = start.dt.month
|
137
137
|
|
138
|
-
|
139
|
-
|
138
|
+
quarter_end_year = np.where(quarter == 4, year + 1, year)
|
139
|
+
quarter_end_month = np.where(quarter == 4, 1, month + 3)
|
140
140
|
|
141
|
-
|
142
|
-
|
141
|
+
end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
|
142
|
+
end.index = df.index
|
143
143
|
|
144
|
-
|
144
|
+
df["days_in_quarter"] = (end - start).dt.days
|
145
145
|
|
146
|
-
|
146
|
+
add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
|
147
147
|
|
148
|
-
|
148
|
+
df.drop(columns=["quarter", "quarter_start", "day_in_quarter", "days_in_quarter"], inplace=True)
|
149
149
|
|
150
150
|
df[seconds] = (df[self.date_column] - df[self.date_column].dt.floor("D")).dt.seconds
|
151
151
|
|
upgini/utils/email_utils.py
CHANGED
@@ -36,11 +36,11 @@ class EmailDomainGenerator:
|
|
36
36
|
self.generated_features = []
|
37
37
|
|
38
38
|
def generate(self, df: pd.DataFrame) -> pd.DataFrame:
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
39
|
+
for email_col in self.email_columns:
|
40
|
+
domain_feature = email_col + self.DOMAIN_SUFFIX
|
41
|
+
if domain_feature not in df.columns:
|
42
|
+
df[domain_feature] = df[email_col].apply(self._email_to_domain).astype("string")
|
43
|
+
self.generated_features.append(domain_feature)
|
44
44
|
return df
|
45
45
|
|
46
46
|
@staticmethod
|
{upgini-1.2.86.dev1.dist-info → upgini-1.2.87a3857.dev1.dist-info}/METADATA
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.86.dev1
|
3
|
+
Version: 1.2.87a3857.dev1
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
{upgini-1.2.86.dev1.dist-info → upgini-1.2.87a3857.dev1.dist-info}/RECORD
@@ -1,12 +1,12 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=3sSsl0Y82MTrFi1HkJZNIy6czqNHsAzjhFjDu3_mdew,33
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=G69F0tRPjKWMhNwvXK0dgHzyTSMHShGN0ycrtYge6kA,215354
|
7
7
|
upgini/http.py,sha256=6Qcepv0tDC72mBBJxYHnA2xqw6QwFaKrXN8o4vju8Es,44372
|
8
8
|
upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=HJ5DpnrWAwrlw3_JlAWEhs1SXfI-_R4TGp2ajavOE14,43129
|
10
10
|
upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -51,10 +51,10 @@ upgini/utils/blocked_time_series.py,sha256=Uqr3vp4YqNclj2-PzEYqVy763GSXHn86sbpIl
|
|
51
51
|
upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
|
52
52
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
53
53
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
54
|
-
upgini/utils/datetime_utils.py,sha256=
|
54
|
+
upgini/utils/datetime_utils.py,sha256=_jq-kn_dGNFfs-DGXcWCGzy9bkplfAjrZ8SsmN28zXc,13535
|
55
55
|
upgini/utils/deduplicate_utils.py,sha256=AcMLoObMjhOTQ_fMS1LWy0GKp6WXnZ-FNux_8V3nbZU,8914
|
56
56
|
upgini/utils/display_utils.py,sha256=hAeWEcJtPDg8fAVcMNrNB-azFD2WJp1nvbPAhR7SeP4,12071
|
57
|
-
upgini/utils/email_utils.py,sha256=
|
57
|
+
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
58
58
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
59
59
|
upgini/utils/feature_info.py,sha256=Q9HN6A-fvfVD-irFWrmOqqZG9RsUSvh5MTY_k0xu-tE,7287
|
60
60
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.86.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
-
upgini-1.2.86.dev1.dist-info/RECORD,,
|
73
|
+
upgini-1.2.87a3857.dev1.dist-info/METADATA,sha256=CAuubJghDMbrQnw0lkf8Go-cHO0uvsYaQYVel40FlLM,49172
|
74
|
+
upgini-1.2.87a3857.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
75
|
+
upgini-1.2.87a3857.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.87a3857.dev1.dist-info/RECORD,,
|
File without changes
|