upgini 1.1.127__tar.gz → 1.1.129__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.127/src/upgini.egg-info → upgini-1.1.129}/PKG-INFO +2 -2
- {upgini-1.1.127 → upgini-1.1.129}/README.md +1 -1
- {upgini-1.1.127 → upgini-1.1.129}/setup.py +1 -1
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/features_enricher.py +31 -11
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/metrics.py +19 -20
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/features_validator.py +11 -1
- {upgini-1.1.127 → upgini-1.1.129/src/upgini.egg-info}/PKG-INFO +2 -2
- {upgini-1.1.127 → upgini-1.1.129}/LICENSE +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/pyproject.toml +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/setup.cfg +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/__init__.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/ads.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/dataset.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/errors.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/http.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/metadata.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/search_task.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/spinner.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/tests/test_country_utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/tests/test_email_utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/tests/test_features_enricher.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/tests/test_metrics.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.127 → upgini-1.1.129}/tests/test_postal_code_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.127
|
|
3
|
+
Version: 1.1.129
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -66,7 +66,7 @@ License-File: LICENSE
|
|
|
66
66
|
## 🚀 Awesome features
|
|
67
67
|
⭐️ Automatically find only relevant features that *give accuracy improvement for ML model*. Not just correlated with target variable, what 9 out of 10 cases gives zero accuracy improvement
|
|
68
68
|
⭐️ Data source optimizations for ML tasks to *"squeeze" maximum information for models* out of the source data: automated feature generation with Large Language Models' data augmentation, RNNs, GraphNN; multiple data source ensembling
|
|
69
|
-
⭐️
|
|
69
|
+
⭐️ *Automatic search key augmentation* from all connected sources. If you do not have all search keys in your search request, such as postal/zip code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
|
|
70
70
|
⭐️ Calculate *accuracy metrics and uplifts* after enrichment existing ML model with external features
|
|
71
71
|
⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate risks of unstable external data dependencies in ML pipeline
|
|
72
72
|
⭐️ Easy to use - single request to enrich training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
|
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
## 🚀 Awesome features
|
|
37
37
|
⭐️ Automatically find only relevant features that *give accuracy improvement for ML model*. Not just correlated with target variable, what 9 out of 10 cases gives zero accuracy improvement
|
|
38
38
|
⭐️ Data source optimizations for ML tasks to *"squeeze" maximum information for models* out of the source data: automated feature generation with Large Language Models' data augmentation, RNNs, GraphNN; multiple data source ensembling
|
|
39
|
-
⭐️
|
|
39
|
+
⭐️ *Automatic search key augmentation* from all connected sources. If you do not have all search keys in your search request, such as postal/zip code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
|
|
40
40
|
⭐️ Calculate *accuracy metrics and uplifts* after enrichment existing ML model with external features
|
|
41
41
|
⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate risks of unstable external data dependencies in ML pipeline
|
|
42
42
|
⭐️ Easy to use - single request to enrich training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
|
|
@@ -53,7 +53,6 @@ from upgini.utils.datetime_utils import DateTimeSearchKeyConverter, is_time_seri
|
|
|
53
53
|
from upgini.utils.display_utils import (
|
|
54
54
|
display_html_dataframe,
|
|
55
55
|
do_without_pandas_limits,
|
|
56
|
-
ipython_available,
|
|
57
56
|
)
|
|
58
57
|
from upgini.utils.email_utils import EmailSearchKeyConverter, EmailSearchKeyDetector
|
|
59
58
|
from upgini.utils.features_validator import FeaturesValidator
|
|
@@ -576,13 +575,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
576
575
|
finally:
|
|
577
576
|
self.logger.info(f"Transform elapsed time: {time.time() - start_time}")
|
|
578
577
|
|
|
579
|
-
if self.country_added
|
|
580
|
-
result = result
|
|
578
|
+
if self.country_added:
|
|
579
|
+
result = drop_existing_columns(result, COUNTRY)
|
|
581
580
|
|
|
582
581
|
if keep_input:
|
|
583
582
|
return result
|
|
584
583
|
else:
|
|
585
|
-
return result
|
|
584
|
+
return drop_existing_columns(result, X.columns)
|
|
586
585
|
|
|
587
586
|
def calculate_metrics(
|
|
588
587
|
self,
|
|
@@ -1002,7 +1001,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1002
1001
|
self.logger.info("Cached enriched dataset found - use it")
|
|
1003
1002
|
X_sampled, y_sampled, enriched_X, eval_set_sampled_dict, search_keys = self.__cached_sampled_datasets
|
|
1004
1003
|
if exclude_features_sources:
|
|
1005
|
-
enriched_X = enriched_X
|
|
1004
|
+
enriched_X = drop_existing_columns(enriched_X, exclude_features_sources)
|
|
1006
1005
|
elif len(self.feature_importances_) == 0:
|
|
1007
1006
|
self.logger.info("No external features selected. So use only input datasets for metrics calculation")
|
|
1008
1007
|
X_sampled, search_keys = self._extend_x(validated_X)
|
|
@@ -1024,7 +1023,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1024
1023
|
self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True),
|
|
1025
1024
|
)
|
|
1026
1025
|
|
|
1027
|
-
enriched_X = enriched_Xy
|
|
1026
|
+
enriched_X = drop_existing_columns(enriched_Xy, TARGET)
|
|
1028
1027
|
x_columns = [
|
|
1029
1028
|
c for c in validated_X.columns.to_list() + self.fit_generated_features if c in enriched_X.columns
|
|
1030
1029
|
]
|
|
@@ -1042,7 +1041,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1042
1041
|
)
|
|
1043
1042
|
|
|
1044
1043
|
for idx in range(len(eval_set)):
|
|
1045
|
-
enriched_eval_X = enriched_eval_sets[idx + 1]
|
|
1044
|
+
enriched_eval_X = drop_existing_columns(enriched_eval_sets[idx + 1], TARGET)
|
|
1046
1045
|
eval_X_sampled = enriched_eval_sets[idx + 1][x_columns].copy()
|
|
1047
1046
|
eval_y_sampled = enriched_eval_sets[idx + 1][TARGET].copy()
|
|
1048
1047
|
eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
|
|
@@ -1160,6 +1159,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1160
1159
|
fitting_X = X_sorted[client_features].copy()
|
|
1161
1160
|
fitting_enriched_X = enriched_X_sorted[client_features + existing_filtered_enriched_features].copy()
|
|
1162
1161
|
|
|
1162
|
+
# Detect and drop high cardinality columns in train
|
|
1163
|
+
columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
|
|
1164
|
+
fitting_X = drop_existing_columns(fitting_X, columns_with_high_cardinality)
|
|
1165
|
+
fitting_enriched_X = drop_existing_columns(fitting_enriched_X, columns_with_high_cardinality)
|
|
1166
|
+
|
|
1163
1167
|
fitting_eval_set_dict = dict()
|
|
1164
1168
|
for idx, eval_tuple in eval_set_sampled_dict.items():
|
|
1165
1169
|
eval_X_sampled, enriched_eval_X, eval_y_sampled = eval_tuple
|
|
@@ -1171,6 +1175,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1171
1175
|
fitting_enriched_eval_X = enriched_eval_X_sorted[
|
|
1172
1176
|
client_features + existing_filtered_enriched_features
|
|
1173
1177
|
].copy()
|
|
1178
|
+
|
|
1179
|
+
# Drop high cardinality columns in eval set
|
|
1180
|
+
fitting_eval_X = drop_existing_columns(fitting_eval_X, columns_with_high_cardinality)
|
|
1181
|
+
fitting_enriched_eval_X = drop_existing_columns(fitting_enriched_eval_X, columns_with_high_cardinality)
|
|
1182
|
+
|
|
1174
1183
|
fitting_eval_set_dict[idx] = (
|
|
1175
1184
|
fitting_eval_X,
|
|
1176
1185
|
eval_y_sorted,
|
|
@@ -2027,7 +2036,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2027
2036
|
self.feature_importances_.append(round_shap_value(feature_meta.shap_value))
|
|
2028
2037
|
|
|
2029
2038
|
internal_provider = feature_meta.data_provider or ""
|
|
2030
|
-
if feature_meta.data_provider
|
|
2039
|
+
if feature_meta.data_provider:
|
|
2031
2040
|
provider = (
|
|
2032
2041
|
f"<a href='{feature_meta.data_provider_link}' "
|
|
2033
2042
|
"target='_blank' rel='noopener noreferrer'>"
|
|
@@ -2037,7 +2046,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2037
2046
|
provider = internal_provider
|
|
2038
2047
|
|
|
2039
2048
|
internal_source = feature_meta.data_source or ""
|
|
2040
|
-
if feature_meta.data_source
|
|
2049
|
+
if feature_meta.data_source:
|
|
2041
2050
|
source = (
|
|
2042
2051
|
f"<a href='{feature_meta.data_source_link}' "
|
|
2043
2052
|
"target='_blank' rel='noopener noreferrer'>"
|
|
@@ -2047,7 +2056,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2047
2056
|
source = internal_source
|
|
2048
2057
|
|
|
2049
2058
|
internal_feature_name = feature_meta.name
|
|
2050
|
-
if feature_meta.doc_link
|
|
2059
|
+
if feature_meta.doc_link:
|
|
2051
2060
|
feature_name = (
|
|
2052
2061
|
f"<a href='{feature_meta.doc_link}' "
|
|
2053
2062
|
"target='_blank' rel='noopener noreferrer'>"
|
|
@@ -2247,7 +2256,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2247
2256
|
self.warning_counter.increment()
|
|
2248
2257
|
except (ImportError, NameError):
|
|
2249
2258
|
print(msg)
|
|
2250
|
-
print(self.
|
|
2259
|
+
print(self._internal_features_info)
|
|
2251
2260
|
|
|
2252
2261
|
def __validate_importance_threshold(self, importance_threshold: Optional[float]) -> float:
|
|
2253
2262
|
try:
|
|
@@ -2487,3 +2496,14 @@ def drop_duplicates(df: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
|
|
|
2487
2496
|
|
|
2488
2497
|
def hash_row(row) -> int:
|
|
2489
2498
|
return zlib.crc32(str(row).encode())
|
|
2499
|
+
|
|
2500
|
+
|
|
2501
|
+
def drop_existing_columns(df: pd.DataFrame, columns_to_drop: Union[List[str], str]) -> pd.DataFrame:
|
|
2502
|
+
if isinstance(columns_to_drop, str):
|
|
2503
|
+
columns_to_drop = [columns_to_drop] if columns_to_drop in df.columns else []
|
|
2504
|
+
elif hasattr(columns_to_drop, "__iter__"):
|
|
2505
|
+
columns_to_drop = [c for c in columns_to_drop if c in df.columns]
|
|
2506
|
+
else:
|
|
2507
|
+
return df
|
|
2508
|
+
|
|
2509
|
+
return df.drop(columns=columns_to_drop)
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
+
from copy import deepcopy
|
|
2
3
|
from typing import Callable, List, Optional, Tuple, Union
|
|
3
4
|
|
|
4
5
|
import numpy as np
|
|
@@ -6,7 +7,7 @@ import pandas as pd
|
|
|
6
7
|
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
7
8
|
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
8
9
|
from numpy import log1p
|
|
9
|
-
from pandas.api.types import is_numeric_dtype
|
|
10
|
+
from pandas.api.types import is_numeric_dtype
|
|
10
11
|
from sklearn.metrics import SCORERS, check_scoring, get_scorer, make_scorer
|
|
11
12
|
from sklearn.metrics._regression import (
|
|
12
13
|
_check_reg_targets,
|
|
@@ -14,7 +15,6 @@ from sklearn.metrics._regression import (
|
|
|
14
15
|
mean_squared_error,
|
|
15
16
|
)
|
|
16
17
|
from sklearn.model_selection import BaseCrossValidator, cross_validate
|
|
17
|
-
from copy import deepcopy
|
|
18
18
|
|
|
19
19
|
from upgini.errors import ValidationError
|
|
20
20
|
from upgini.metadata import ModelTaskType
|
|
@@ -82,15 +82,6 @@ class EstimatorWrapper:
|
|
|
82
82
|
else:
|
|
83
83
|
X[c] = X[c].astype(str)
|
|
84
84
|
|
|
85
|
-
# Remove high cardinality columns
|
|
86
|
-
row_count = X.shape[0]
|
|
87
|
-
columns_cardinality = [
|
|
88
|
-
i
|
|
89
|
-
for i in X
|
|
90
|
-
if (is_string_dtype(X[i]) or is_integer_dtype(X[i])) and (X[i].nunique() / row_count >= 0.9)
|
|
91
|
-
]
|
|
92
|
-
X = X.drop(columns=columns_cardinality)
|
|
93
|
-
|
|
94
85
|
if not isinstance(y, pd.Series):
|
|
95
86
|
raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
|
|
96
87
|
|
|
@@ -98,7 +89,7 @@ class EstimatorWrapper:
|
|
|
98
89
|
joined = joined[joined[y.name].notna()]
|
|
99
90
|
joined = joined.reset_index(drop=True)
|
|
100
91
|
X = joined.drop(columns=y.name)
|
|
101
|
-
y = joined[y.name].values
|
|
92
|
+
y = np.array(list(joined[y.name].values))
|
|
102
93
|
return X, y, {}
|
|
103
94
|
|
|
104
95
|
def cross_val_predict(self, X: pd.DataFrame, y: np.ndarray):
|
|
@@ -167,9 +158,9 @@ class EstimatorWrapper:
|
|
|
167
158
|
kwargs["estimator"] = estimator_copy
|
|
168
159
|
if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
|
|
169
160
|
if cat_features is not None:
|
|
170
|
-
estimator_copy.set_params(
|
|
171
|
-
X.columns.get_loc(cat_feature) for cat_feature in cat_features
|
|
172
|
-
|
|
161
|
+
estimator_copy.set_params(
|
|
162
|
+
cat_features=[X.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
163
|
+
)
|
|
173
164
|
estimator = CatBoostWrapper(**kwargs)
|
|
174
165
|
else:
|
|
175
166
|
try:
|
|
@@ -312,11 +303,19 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
|
|
|
312
303
|
supported_metrics = set(SCORERS.keys())
|
|
313
304
|
neg_metrics = [m[4:] for m in supported_metrics if m.startswith("neg_")]
|
|
314
305
|
supported_metrics.update(neg_metrics)
|
|
315
|
-
supported_metrics.update(
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
306
|
+
supported_metrics.update(
|
|
307
|
+
[
|
|
308
|
+
"mean_squared_log_error",
|
|
309
|
+
"MSLE",
|
|
310
|
+
"msle",
|
|
311
|
+
"root_mean_squared_log_error",
|
|
312
|
+
"RMSLE",
|
|
313
|
+
"rmsle",
|
|
314
|
+
"root_mean_squared_error",
|
|
315
|
+
"RMSE",
|
|
316
|
+
"rmse",
|
|
317
|
+
]
|
|
318
|
+
)
|
|
320
319
|
raise ValidationError(bundle.get("metrics_invalid_scoring").format(scoring, sorted(supported_metrics)))
|
|
321
320
|
elif hasattr(scoring, "__name__"):
|
|
322
321
|
metric_name = scoring.__name__
|
|
@@ -3,7 +3,7 @@ from logging import Logger
|
|
|
3
3
|
from typing import List, Optional
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
|
-
from pandas.api.types import is_object_dtype
|
|
6
|
+
from pandas.api.types import is_object_dtype, is_integer_dtype, is_string_dtype
|
|
7
7
|
from upgini.resource_bundle import bundle
|
|
8
8
|
from upgini.utils.warning_counter import WarningCounter
|
|
9
9
|
|
|
@@ -53,3 +53,13 @@ class FeaturesValidator:
|
|
|
53
53
|
self.logger.warning(msg)
|
|
54
54
|
|
|
55
55
|
return empty_or_constant_features
|
|
56
|
+
|
|
57
|
+
@staticmethod
|
|
58
|
+
def find_high_cardinality(df: pd.DataFrame):
|
|
59
|
+
# Remove high cardinality columns
|
|
60
|
+
row_count = df.shape[0]
|
|
61
|
+
return [
|
|
62
|
+
i
|
|
63
|
+
for i in df
|
|
64
|
+
if (is_string_dtype(df[i]) or is_integer_dtype(df[i])) and (df[i].nunique() / row_count >= 0.9)
|
|
65
|
+
]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.127
|
|
3
|
+
Version: 1.1.129
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -66,7 +66,7 @@ License-File: LICENSE
|
|
|
66
66
|
## 🚀 Awesome features
|
|
67
67
|
⭐️ Automatically find only relevant features that *give accuracy improvement for ML model*. Not just correlated with target variable, what 9 out of 10 cases gives zero accuracy improvement
|
|
68
68
|
⭐️ Data source optimizations for ML tasks to *"squeeze" maximum information for models* out of the source data: automated feature generation with Large Language Models' data augmentation, RNNs, GraphNN; multiple data source ensembling
|
|
69
|
-
⭐️
|
|
69
|
+
⭐️ *Automatic search key augmentation* from all connected sources. If you do not have all search keys in your search request, such as postal/zip code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
|
|
70
70
|
⭐️ Calculate *accuracy metrics and uplifts* after enrichment existing ML model with external features
|
|
71
71
|
⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate risks of unstable external data dependencies in ML pipeline
|
|
72
72
|
⭐️ Easy to use - single request to enrich training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|