upgini 1.2.9__py3-none-any.whl → 1.2.9a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/__init__.py +2 -2
- upgini/features_enricher.py +22 -13
- upgini/metrics.py +1 -5
- upgini/utils/features_validator.py +1 -1
- {upgini-1.2.9.dist-info → upgini-1.2.9a2.dist-info}/METADATA +1 -2
- {upgini-1.2.9.dist-info → upgini-1.2.9a2.dist-info}/RECORD +9 -9
- {upgini-1.2.9.dist-info → upgini-1.2.9a2.dist-info}/WHEEL +0 -0
- {upgini-1.2.9.dist-info → upgini-1.2.9a2.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.9"
|
|
1
|
+
__version__ = "1.2.9a2"
|
upgini/__init__.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
|
-
from upgini.features_enricher import FeaturesEnricher # noqa: F401
|
|
4
|
-
from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
|
|
3
|
+
# from upgini.features_enricher import FeaturesEnricher # noqa: F401
|
|
4
|
+
# from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
|
|
5
5
|
# from .lazy_import import LazyImport
|
|
6
6
|
|
|
7
7
|
os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"
|
upgini/features_enricher.py
CHANGED
|
@@ -1103,7 +1103,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1103
1103
|
else:
|
|
1104
1104
|
eval_uplift = None
|
|
1105
1105
|
|
|
1106
|
-
|
|
1106
|
+
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
1107
1107
|
eval_metrics = {
|
|
1108
1108
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
|
1109
1109
|
"quality_metrics_eval_segment"
|
|
@@ -1369,7 +1369,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1369
1369
|
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
|
1370
1370
|
)
|
|
1371
1371
|
]
|
|
1372
|
-
self.logger.info(f"Client features column on prepare data for metrics: {client_features}")
|
|
1373
1372
|
|
|
1374
1373
|
filtered_enriched_features = self.__filtered_enriched_features(
|
|
1375
1374
|
importance_threshold,
|
|
@@ -1436,19 +1435,31 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1436
1435
|
)
|
|
1437
1436
|
|
|
1438
1437
|
fitting_eval_set_dict = {}
|
|
1439
|
-
fitting_x_columns = fitting_X.columns.to_list()
|
|
1440
|
-
self.logger.info(f"Final list of fitting X columns: {fitting_x_columns}")
|
|
1441
|
-
fitting_enriched_x_columns = fitting_enriched_X.columns.to_list()
|
|
1442
|
-
self.logger.info(f"Final list of fitting enriched X columns: {fitting_enriched_x_columns}")
|
|
1443
1438
|
for idx, eval_tuple in eval_set_sampled_dict.items():
|
|
1444
1439
|
eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
|
|
1445
1440
|
eval_X_sorted, eval_y_sorted = self._sort_by_system_record_id(eval_X_sampled, eval_y_sampled, self.cv)
|
|
1446
1441
|
enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
|
|
1447
1442
|
enriched_eval_X, eval_y_sampled, self.cv
|
|
1448
1443
|
)
|
|
1449
|
-
fitting_eval_X = eval_X_sorted[
|
|
1450
|
-
fitting_enriched_eval_X = enriched_eval_X_sorted[
|
|
1451
|
-
|
|
1444
|
+
fitting_eval_X = eval_X_sorted[client_features].copy()
|
|
1445
|
+
fitting_enriched_eval_X = enriched_eval_X_sorted[
|
|
1446
|
+
client_features + existing_filtered_enriched_features
|
|
1447
|
+
].copy()
|
|
1448
|
+
|
|
1449
|
+
# # Drop high cardinality features in eval set
|
|
1450
|
+
if len(columns_with_high_cardinality) > 0:
|
|
1451
|
+
fitting_eval_X = fitting_eval_X.drop(columns=columns_with_high_cardinality, errors="ignore")
|
|
1452
|
+
fitting_enriched_eval_X = fitting_enriched_eval_X.drop(
|
|
1453
|
+
columns=columns_with_high_cardinality, errors="ignore"
|
|
1454
|
+
)
|
|
1455
|
+
# Drop constant features in eval_set
|
|
1456
|
+
if len(constant_columns) > 0:
|
|
1457
|
+
fitting_eval_X = fitting_eval_X.drop(columns=constant_columns, errors="ignore")
|
|
1458
|
+
fitting_enriched_eval_X = fitting_enriched_eval_X.drop(columns=constant_columns, errors="ignore")
|
|
1459
|
+
# Drop datetime features in eval_set
|
|
1460
|
+
if len(datetime_features) > 0:
|
|
1461
|
+
fitting_eval_X = fitting_eval_X.drop(columns=datetime_features, errors="ignore")
|
|
1462
|
+
fitting_enriched_eval_X = fitting_enriched_eval_X.drop(columns=datetime_features, errors="ignore")
|
|
1452
1463
|
# Convert bool to string in eval_set
|
|
1453
1464
|
if len(bool_columns) > 0:
|
|
1454
1465
|
fitting_eval_X[col] = fitting_eval_X[col].astype(str)
|
|
@@ -1669,7 +1680,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1669
1680
|
X_sampled = enriched_Xy[x_columns].copy()
|
|
1670
1681
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1671
1682
|
enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
|
|
1672
|
-
enriched_X_columns = enriched_X.columns.to_list()
|
|
1673
1683
|
|
|
1674
1684
|
self.logger.info(f"Shape of enriched_X: {enriched_X.shape}")
|
|
1675
1685
|
self.logger.info(f"Shape of X after sampling: {X_sampled.shape}")
|
|
@@ -1684,7 +1694,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1684
1694
|
for idx in range(len(eval_set)):
|
|
1685
1695
|
eval_X_sampled = enriched_eval_sets[idx + 1][x_columns].copy()
|
|
1686
1696
|
eval_y_sampled = enriched_eval_sets[idx + 1][TARGET].copy()
|
|
1687
|
-
enriched_eval_X = enriched_eval_sets[idx + 1][
|
|
1697
|
+
enriched_eval_X = enriched_eval_sets[idx + 1].drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1688
1698
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
1689
1699
|
|
|
1690
1700
|
self.__cached_sampled_datasets = (
|
|
@@ -1763,13 +1773,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1763
1773
|
X_sampled = enriched_Xy[x_columns].copy()
|
|
1764
1774
|
y_sampled = enriched_Xy[TARGET].copy()
|
|
1765
1775
|
enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1766
|
-
enriched_X_columns = enriched_X.columns.tolist()
|
|
1767
1776
|
|
|
1768
1777
|
for idx in range(len(eval_set)):
|
|
1769
1778
|
enriched_eval_xy = enriched_df.query(f"{EVAL_SET_INDEX} == {idx + 1}")
|
|
1770
1779
|
eval_x_sampled = enriched_eval_xy[x_columns].copy()
|
|
1771
1780
|
eval_y_sampled = enriched_eval_xy[TARGET].copy()
|
|
1772
|
-
enriched_eval_x = enriched_eval_xy[
|
|
1781
|
+
enriched_eval_x = enriched_eval_xy.drop(columns=[TARGET, EVAL_SET_INDEX])
|
|
1773
1782
|
eval_set_sampled_dict[idx] = (eval_x_sampled, enriched_eval_x, eval_y_sampled)
|
|
1774
1783
|
else:
|
|
1775
1784
|
self.logger.info("Transform without eval_set")
|
upgini/metrics.py
CHANGED
|
@@ -10,7 +10,6 @@ import catboost
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
import pandas as pd
|
|
12
12
|
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
13
|
-
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
14
13
|
from numpy import log1p
|
|
15
14
|
from pandas.api.types import is_numeric_dtype
|
|
16
15
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
|
@@ -254,7 +253,6 @@ class EstimatorWrapper:
|
|
|
254
253
|
def _prepare_data(
|
|
255
254
|
self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
|
|
256
255
|
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
|
|
257
|
-
self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
|
|
258
256
|
for c in x.columns:
|
|
259
257
|
if is_numeric_dtype(x[c]):
|
|
260
258
|
x[c] = x[c].astype(float)
|
|
@@ -273,7 +271,6 @@ class EstimatorWrapper:
|
|
|
273
271
|
else:
|
|
274
272
|
x, y = self._remove_empty_target_rows(x, y)
|
|
275
273
|
|
|
276
|
-
self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
|
|
277
274
|
return x, y, groups
|
|
278
275
|
|
|
279
276
|
def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
|
|
@@ -408,6 +405,7 @@ class EstimatorWrapper:
|
|
|
408
405
|
estimator = CatBoostWrapper(**kwargs)
|
|
409
406
|
else:
|
|
410
407
|
try:
|
|
408
|
+
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
411
409
|
if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
|
|
412
410
|
estimator = LightGBMWrapper(**kwargs)
|
|
413
411
|
else:
|
|
@@ -495,9 +493,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
495
493
|
if x[name].nunique() > 1:
|
|
496
494
|
unique_cat_features.append(name)
|
|
497
495
|
else:
|
|
498
|
-
self.logger.info(f"Drop column {name} on preparing data for fit")
|
|
499
496
|
x = x.drop(columns=name)
|
|
500
|
-
self.exclude_features.append(name)
|
|
501
497
|
self.cat_features = unique_cat_features
|
|
502
498
|
if (
|
|
503
499
|
hasattr(self.estimator, "get_param")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.9
|
|
3
|
+
Version: 1.2.9a2
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -28,7 +28,6 @@ Requires-Dist: fastparquet>=0.8.1
|
|
|
28
28
|
Requires-Dist: ipywidgets>=8.1.0
|
|
29
29
|
Requires-Dist: jarowinkler>=2.0.0
|
|
30
30
|
Requires-Dist: levenshtein>=0.25.1
|
|
31
|
-
Requires-Dist: lightgbm>=3.3.2
|
|
32
31
|
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
33
32
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
34
33
|
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
2
|
-
upgini/__init__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=GR-gR128AJNWooDSfagbh8bkcWGBGJA-QDxLGQEYeqQ,24
|
|
2
|
+
upgini/__init__.py,sha256=3WLf0J2JF5xhTYOKbReBzkRmmvXTl_V7JKZm7b-cpTo,593
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=J5-bn07iaJSXVN8COeu2RbDqZ4NTPd1L27HePaNh52o,188134
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
10
|
+
upgini/metrics.py,sha256=AYVvcqSqO_UWwFIby0gcqSDNLiIoy6EU3pa8aUBUQ4k,30946
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
|
|
@@ -47,7 +47,7 @@ upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwt
|
|
|
47
47
|
upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
|
|
48
48
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
49
49
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
50
|
-
upgini/utils/features_validator.py,sha256=
|
|
50
|
+
upgini/utils/features_validator.py,sha256=PgKNt5dyqfErTvjtRNNUS9g7GFqHBtAtnsfA-V5UO1A,3307
|
|
51
51
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
52
52
|
upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
|
|
53
53
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.9.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
-
upgini-1.2.9.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
-
upgini-1.2.9.dist-info/RECORD,,
|
|
60
|
+
upgini-1.2.9a2.dist-info/METADATA,sha256=NepU5uIYzESWv2GCSJrd6W3GOd4m2ipgrZChnFdiJTM,48578
|
|
61
|
+
upgini-1.2.9a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.9a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.9a2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|