upgini 1.2.9a110__py3-none-any.whl → 1.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +15 -10
- upgini/metrics.py +2 -1
- upgini/resource_bundle/strings.properties +1 -1
- upgini/utils/features_validator.py +12 -3
- {upgini-1.2.9a110.dist-info → upgini-1.2.11.dist-info}/METADATA +1 -2
- {upgini-1.2.9a110.dist-info → upgini-1.2.11.dist-info}/RECORD +9 -9
- {upgini-1.2.9a110.dist-info → upgini-1.2.11.dist-info}/WHEEL +0 -0
- {upgini-1.2.9a110.dist-info → upgini-1.2.11.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.9a110"
|
|
1
|
+
__version__ = "1.2.11"
|
upgini/features_enricher.py
CHANGED
|
@@ -1577,7 +1577,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1577
1577
|
df = generator.generate(df)
|
|
1578
1578
|
generated_features.extend(generator.generated_features)
|
|
1579
1579
|
|
|
1580
|
-
normalizer = Normalizer(
|
|
1580
|
+
normalizer = Normalizer(search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
|
|
1581
1581
|
df = normalizer.normalize(df)
|
|
1582
1582
|
columns_renaming = normalizer.columns_renaming
|
|
1583
1583
|
|
|
@@ -1633,10 +1633,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1633
1633
|
|
|
1634
1634
|
rows_to_drop = None
|
|
1635
1635
|
has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
|
|
1636
|
-
|
|
1636
|
+
self.model_task_type = self.model_task_type or define_task(
|
|
1637
1637
|
self.df_with_original_index[TARGET], has_date, self.logger, silent=True
|
|
1638
1638
|
)
|
|
1639
|
-
if
|
|
1639
|
+
if self.model_task_type == ModelTaskType.REGRESSION:
|
|
1640
1640
|
target_outliers_df = self._search_task.get_target_outliers(trace_id)
|
|
1641
1641
|
if target_outliers_df is not None and len(target_outliers_df) > 0:
|
|
1642
1642
|
outliers = pd.merge(
|
|
@@ -2391,12 +2391,12 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2391
2391
|
|
|
2392
2392
|
maybe_date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2393
2393
|
has_date = maybe_date_column is not None
|
|
2394
|
-
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2394
|
+
self.model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2395
2395
|
|
|
2396
|
-
self._validate_binary_observations(validated_y, model_task_type)
|
|
2396
|
+
self._validate_binary_observations(validated_y, self.model_task_type)
|
|
2397
2397
|
|
|
2398
2398
|
self.runtime_parameters = get_runtime_params_custom_loss(
|
|
2399
|
-
self.loss, model_task_type, self.runtime_parameters, self.logger
|
|
2399
|
+
self.loss, self.model_task_type, self.runtime_parameters, self.logger
|
|
2400
2400
|
)
|
|
2401
2401
|
|
|
2402
2402
|
if validated_eval_set is not None and len(validated_eval_set) > 0:
|
|
@@ -2449,7 +2449,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2449
2449
|
if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
|
|
2450
2450
|
self._validate_PSI(df.sort_values(by=maybe_date_column))
|
|
2451
2451
|
|
|
2452
|
-
self.__adjust_cv(df, maybe_date_column, model_task_type)
|
|
2452
|
+
self.__adjust_cv(df, maybe_date_column, self.model_task_type)
|
|
2453
2453
|
|
|
2454
2454
|
normalizer = Normalizer(
|
|
2455
2455
|
self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
|
|
@@ -2522,7 +2522,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2522
2522
|
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
|
2523
2523
|
|
|
2524
2524
|
features_to_drop = FeaturesValidator(self.logger).validate(
|
|
2525
|
-
df, features_columns, self.generate_features, self.warning_counter
|
|
2525
|
+
df, features_columns, self.generate_features, self.warning_counter, columns_renaming
|
|
2526
2526
|
)
|
|
2527
2527
|
self.fit_dropped_features.update(features_to_drop)
|
|
2528
2528
|
df = df.drop(columns=features_to_drop)
|
|
@@ -2557,7 +2557,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2557
2557
|
meaning_types=meaning_types,
|
|
2558
2558
|
search_keys=combined_search_keys,
|
|
2559
2559
|
unnest_search_keys=unnest_search_keys,
|
|
2560
|
-
model_task_type=model_task_type,
|
|
2560
|
+
model_task_type=self.model_task_type,
|
|
2561
2561
|
date_format=self.date_format,
|
|
2562
2562
|
random_state=self.random_state,
|
|
2563
2563
|
rest_client=self.rest_client,
|
|
@@ -2780,6 +2780,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2780
2780
|
raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(EVAL_SET_INDEX))
|
|
2781
2781
|
if SYSTEM_RECORD_ID in validated_X.columns:
|
|
2782
2782
|
raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(SYSTEM_RECORD_ID))
|
|
2783
|
+
if ENTITY_SYSTEM_RECORD_ID in validated_X.columns:
|
|
2784
|
+
raise ValidationError(self.bundle.get("x_contains_reserved_column_name").format(ENTITY_SYSTEM_RECORD_ID))
|
|
2783
2785
|
|
|
2784
2786
|
return validated_X
|
|
2785
2787
|
|
|
@@ -3760,7 +3762,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3760
3762
|
display_html_dataframe(self.metrics, self.metrics, msg)
|
|
3761
3763
|
|
|
3762
3764
|
def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
|
|
3763
|
-
|
|
3765
|
+
search_key_names = search_keys.keys()
|
|
3766
|
+
if self.fit_columns_renaming:
|
|
3767
|
+
search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
|
|
3768
|
+
msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
|
|
3764
3769
|
|
|
3765
3770
|
try:
|
|
3766
3771
|
_ = get_ipython() # type: ignore
|
upgini/metrics.py
CHANGED
|
@@ -10,7 +10,6 @@ import catboost
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
import pandas as pd
|
|
12
12
|
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
13
|
-
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
14
13
|
from numpy import log1p
|
|
15
14
|
from pandas.api.types import is_numeric_dtype
|
|
16
15
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
|
@@ -408,6 +407,8 @@ class EstimatorWrapper:
|
|
|
408
407
|
estimator = CatBoostWrapper(**kwargs)
|
|
409
408
|
else:
|
|
410
409
|
try:
|
|
410
|
+
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
411
|
+
|
|
411
412
|
if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
|
|
412
413
|
estimator = LightGBMWrapper(**kwargs)
|
|
413
414
|
else:
|
|
upgini/resource_bundle/strings.properties
CHANGED
|
@@ -190,7 +190,7 @@ ads_upload_too_few_rows=At least 1000 records per sample are needed. Increase th
|
|
|
190
190
|
ads_upload_search_key_not_found=Search key {} wasn't found in dataframe columns
|
|
191
191
|
ads_upload_to_many_empty_rows=More than 50% of rows in the submitted sample doesn't contain valid keys\nPlease fill the key columns with valid values and resubmit the data
|
|
192
192
|
# Features info warning
|
|
193
|
-
features_info_zero_important_features=Oops, we can't find any relevant external features for your training dataset,\nmost probably due to issues with search keys formats
|
|
193
|
+
features_info_zero_important_features=Oops, we can't find any relevant external features for your training dataset,\nmost probably due to issues with search keys formats.\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
|
|
194
194
|
features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
|
|
195
195
|
features_not_generated=\nWARNING: Following features didn't pass checks for automated feature generation: {}
|
|
196
196
|
|
|
upgini/utils/features_validator.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from logging import Logger
|
|
3
|
-
from typing import List, Optional
|
|
3
|
+
from typing import Dict, List, Optional
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from pandas.api.types import is_integer_dtype, is_object_dtype, is_string_dtype
|
|
@@ -23,6 +23,7 @@ class FeaturesValidator:
|
|
|
23
23
|
features: List[str],
|
|
24
24
|
features_for_generate: Optional[List[str]],
|
|
25
25
|
warning_counter: WarningCounter,
|
|
26
|
+
columns_renaming: Optional[Dict[str, str]] = None,
|
|
26
27
|
) -> List[str]:
|
|
27
28
|
# one_hot_encoded_features = []
|
|
28
29
|
empty_or_constant_features = []
|
|
@@ -56,7 +57,11 @@ class FeaturesValidator:
|
|
|
56
57
|
# warning_counter.increment()
|
|
57
58
|
|
|
58
59
|
if empty_or_constant_features:
|
|
59
|
-
|
|
60
|
+
if columns_renaming:
|
|
61
|
+
display_names = [columns_renaming.get(f, f) for f in empty_or_constant_features]
|
|
62
|
+
else:
|
|
63
|
+
display_names = empty_or_constant_features
|
|
64
|
+
msg = bundle.get("empty_or_contant_features").format(display_names)
|
|
60
65
|
print(msg)
|
|
61
66
|
self.logger.warning(msg)
|
|
62
67
|
warning_counter.increment()
|
|
@@ -65,7 +70,11 @@ class FeaturesValidator:
|
|
|
65
70
|
if features_for_generate:
|
|
66
71
|
high_cardinality_features = [f for f in high_cardinality_features if f not in features_for_generate]
|
|
67
72
|
if high_cardinality_features:
|
|
68
|
-
|
|
73
|
+
if columns_renaming:
|
|
74
|
+
display_names = [columns_renaming.get(f, f) for f in high_cardinality_features]
|
|
75
|
+
else:
|
|
76
|
+
display_names = empty_or_constant_features
|
|
77
|
+
msg = bundle.get("high_cardinality_features").format(display_names)
|
|
69
78
|
print(msg)
|
|
70
79
|
self.logger.warning(msg)
|
|
71
80
|
warning_counter.increment()
|
|
{upgini-1.2.9a110.dist-info → upgini-1.2.11.dist-info}/METADATA
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.9a110
|
|
3
|
+
Version: 1.2.11
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -28,7 +28,6 @@ Requires-Dist: fastparquet>=0.8.1
|
|
|
28
28
|
Requires-Dist: ipywidgets>=8.1.0
|
|
29
29
|
Requires-Dist: jarowinkler>=2.0.0
|
|
30
30
|
Requires-Dist: levenshtein>=0.25.1
|
|
31
|
-
Requires-Dist: lightgbm>=3.3.2
|
|
32
31
|
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
33
32
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
34
33
|
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
{upgini-1.2.9a110.dist-info → upgini-1.2.11.dist-info}/RECORD
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=aBWZsCYiXXcSUsUJr3tOTQWsH7ZDqJzyMYdQbOd5Qtc,23
|
|
2
2
|
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=eRkI2qpV-IprB1dQAMxzto6I6Q3b3SBuDMVR1_OFlyA,188008
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
10
|
+
upgini/metrics.py,sha256=aKJwAYUGNRdiz9z-bxDxs4jGZQ_VkPXa7sZ52C0VpVI,31243
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=ddSKUK_-eGJB3NgrqOMoWJU-OxQ253WsNLp8aqJkaIM,1389
|
|
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=bHRPWCNrUvt2R9qMX6dZFCJ0i8ENVCQ2Rw3dHH9IJEg,7447
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=faj0wJHppGTKCTbXW8KjqLuGyFNjgb5evEMeSrq_LCE,26460
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -47,7 +47,7 @@ upgini/utils/deduplicate_utils.py,sha256=Zvs7zW4QzaERQmJNPrTVf2ZTVBkBLOycFCzyMwt
|
|
|
47
47
|
upgini/utils/display_utils.py,sha256=A2ouB5eiZ-Kyt9ykYxkLQwyoRPrdYeJymwNTiajtFXs,10990
|
|
48
48
|
upgini/utils/email_utils.py,sha256=j0Ug1R_0AnCg1Y92zIZ4XMwvKo3G5_pcOlBN1OH_gZs,5191
|
|
49
49
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
50
|
-
upgini/utils/features_validator.py,sha256=
|
|
50
|
+
upgini/utils/features_validator.py,sha256=lf5Z-taTl98p7nAWQIyM0dUfkodbzjxv0mOSIZl1jRU,3760
|
|
51
51
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
52
52
|
upgini/utils/ip_utils.py,sha256=Q6vb7Sr5Khx3Sq3eENjW2qCXKej_S5jZbneH6zEOkzQ,5171
|
|
53
53
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
60
|
+
upgini-1.2.11.dist-info/METADATA,sha256=DKUOOrexxVQVXzyaD9sXsPyT8VYx2eys3oKt15nVGtI,48577
|
|
61
|
+
upgini-1.2.11.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
62
|
+
upgini-1.2.11.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.11.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|