upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -0
- upgini/ads.py +6 -2
- upgini/ads_management/ads_manager.py +4 -2
- upgini/autofe/all_operands.py +16 -4
- upgini/autofe/binary.py +2 -1
- upgini/autofe/date.py +74 -7
- upgini/autofe/feature.py +1 -1
- upgini/autofe/groupby.py +3 -1
- upgini/autofe/operand.py +4 -3
- upgini/autofe/unary.py +20 -1
- upgini/autofe/vector.py +2 -0
- upgini/data_source/data_source_publisher.py +14 -4
- upgini/dataset.py +8 -7
- upgini/errors.py +1 -1
- upgini/features_enricher.py +156 -63
- upgini/http.py +11 -10
- upgini/mdc/__init__.py +1 -3
- upgini/mdc/context.py +4 -6
- upgini/metadata.py +3 -0
- upgini/metrics.py +160 -96
- upgini/normalizer/phone_normalizer.py +2 -2
- upgini/resource_bundle/__init__.py +5 -5
- upgini/resource_bundle/strings.properties +9 -4
- upgini/sampler/base.py +1 -4
- upgini/sampler/random_under_sampler.py +2 -5
- upgini/search_task.py +4 -4
- upgini/spinner.py +1 -1
- upgini/utils/__init__.py +3 -2
- upgini/utils/base_search_key_detector.py +2 -2
- upgini/utils/blocked_time_series.py +4 -2
- upgini/utils/country_utils.py +2 -2
- upgini/utils/custom_loss_utils.py +3 -2
- upgini/utils/cv_utils.py +2 -2
- upgini/utils/datetime_utils.py +75 -18
- upgini/utils/deduplicate_utils.py +61 -18
- upgini/utils/email_utils.py +3 -3
- upgini/utils/fallback_progress_bar.py +1 -1
- upgini/utils/features_validator.py +2 -1
- upgini/utils/progress_bar.py +1 -1
- upgini/utils/sklearn_ext.py +15 -15
- upgini/utils/target_utils.py +21 -7
- upgini/utils/track_info.py +27 -15
- upgini/version_validator.py +2 -2
- {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/METADATA +21 -23
- upgini-1.1.280a3418.post2.dist-info/RECORD +62 -0
- {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/WHEEL +1 -2
- upgini-1.1.262a3250.post4.dist-info/RECORD +0 -62
- upgini-1.1.262a3250.post4.dist-info/top_level.txt +0 -1
- {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info/licenses}/LICENSE +0 -0
upgini/features_enricher.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import dataclasses
|
|
2
|
+
import datetime
|
|
2
3
|
import gc
|
|
3
4
|
import hashlib
|
|
4
5
|
import itertools
|
|
@@ -20,6 +21,7 @@ from pandas.api.types import (
|
|
|
20
21
|
is_bool,
|
|
21
22
|
is_datetime64_any_dtype,
|
|
22
23
|
is_numeric_dtype,
|
|
24
|
+
is_object_dtype,
|
|
23
25
|
is_period_dtype,
|
|
24
26
|
is_string_dtype,
|
|
25
27
|
)
|
|
@@ -70,6 +72,7 @@ from upgini.utils.datetime_utils import (
|
|
|
70
72
|
DateTimeSearchKeyConverter,
|
|
71
73
|
is_blocked_time_series,
|
|
72
74
|
is_time_series,
|
|
75
|
+
validate_dates_distribution,
|
|
73
76
|
)
|
|
74
77
|
from upgini.utils.deduplicate_utils import (
|
|
75
78
|
clean_full_duplicates,
|
|
@@ -93,7 +96,7 @@ try:
|
|
|
93
96
|
except Exception:
|
|
94
97
|
from upgini.utils.fallback_progress_bar import CustomFallbackProgressBar as ProgressBar
|
|
95
98
|
|
|
96
|
-
from upgini.utils.target_utils import define_task
|
|
99
|
+
from upgini.utils.target_utils import calculate_psi, define_task
|
|
97
100
|
from upgini.utils.warning_counter import WarningCounter
|
|
98
101
|
from upgini.version_validator import validate_version
|
|
99
102
|
|
|
@@ -145,6 +148,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
145
148
|
"""
|
|
146
149
|
|
|
147
150
|
TARGET_NAME = "target"
|
|
151
|
+
CURRENT_DATE = "current_date"
|
|
148
152
|
RANDOM_STATE = 42
|
|
149
153
|
CALCULATE_METRICS_THRESHOLD = 50_000_000
|
|
150
154
|
CALCULATE_METRICS_MIN_THRESHOLD = 500
|
|
@@ -206,6 +210,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
206
210
|
client_ip: Optional[str] = None,
|
|
207
211
|
client_visitorid: Optional[str] = None,
|
|
208
212
|
custom_bundle_config: Optional[str] = None,
|
|
213
|
+
add_date_if_missing: bool = True,
|
|
209
214
|
**kwargs,
|
|
210
215
|
):
|
|
211
216
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -316,6 +321,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
316
321
|
self.raise_validation_error = raise_validation_error
|
|
317
322
|
self.exclude_columns = exclude_columns
|
|
318
323
|
self.baseline_score_column = baseline_score_column
|
|
324
|
+
self.add_date_if_missing = add_date_if_missing
|
|
319
325
|
|
|
320
326
|
def _get_api_key(self):
|
|
321
327
|
return self._api_key
|
|
@@ -419,11 +425,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
419
425
|
|
|
420
426
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
421
427
|
|
|
428
|
+
# Validate client estimator params
|
|
429
|
+
self._get_client_cat_features(estimator, X, self.search_keys)
|
|
430
|
+
|
|
422
431
|
try:
|
|
423
432
|
self.X = X
|
|
424
433
|
self.y = y
|
|
425
434
|
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
426
|
-
self.dump_input(trace_id, X, y, eval_set)
|
|
435
|
+
self.dump_input(trace_id, X, y, self.eval_set)
|
|
427
436
|
self.__inner_fit(
|
|
428
437
|
trace_id,
|
|
429
438
|
X,
|
|
@@ -562,7 +571,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
562
571
|
self.X = X
|
|
563
572
|
self.y = y
|
|
564
573
|
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
565
|
-
self.dump_input(trace_id, X, y, eval_set)
|
|
574
|
+
self.dump_input(trace_id, X, y, self.eval_set)
|
|
566
575
|
|
|
567
576
|
if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
|
|
568
577
|
raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS))
|
|
@@ -812,6 +821,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
812
821
|
trace_id = trace_id or str(uuid.uuid4())
|
|
813
822
|
start_time = time.time()
|
|
814
823
|
with MDC(trace_id=trace_id):
|
|
824
|
+
self.logger.info("Start calculate metrics")
|
|
815
825
|
if len(args) > 0:
|
|
816
826
|
msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
|
|
817
827
|
self.logger.warning(msg)
|
|
@@ -822,12 +832,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
822
832
|
print(msg)
|
|
823
833
|
|
|
824
834
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
835
|
+
effective_X = X if X is not None else self.X
|
|
836
|
+
effective_y = y if y is not None else self.y
|
|
837
|
+
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
838
|
+
effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
|
|
825
839
|
|
|
826
840
|
try:
|
|
827
841
|
self.__log_debug_information(
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
842
|
+
effective_X,
|
|
843
|
+
effective_y,
|
|
844
|
+
effective_eval_set,
|
|
831
845
|
exclude_features_sources=exclude_features_sources,
|
|
832
846
|
cv=cv if cv is not None else self.cv,
|
|
833
847
|
importance_threshold=importance_threshold,
|
|
@@ -841,17 +855,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
841
855
|
self._search_task is None
|
|
842
856
|
or self._search_task.provider_metadata_v2 is None
|
|
843
857
|
or len(self._search_task.provider_metadata_v2) == 0
|
|
844
|
-
or
|
|
845
|
-
or
|
|
858
|
+
or effective_X is None
|
|
859
|
+
or effective_y is None
|
|
846
860
|
):
|
|
847
861
|
raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
|
|
848
862
|
|
|
849
863
|
if X is not None and y is None:
|
|
850
864
|
raise ValidationError("X passed without y")
|
|
851
865
|
|
|
852
|
-
effective_X = X if X is not None else self.X
|
|
853
|
-
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
854
|
-
|
|
855
866
|
validate_scoring_argument(scoring)
|
|
856
867
|
|
|
857
868
|
self._validate_baseline_score(effective_X, effective_eval_set)
|
|
@@ -862,29 +873,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
862
873
|
self.__display_support_link(msg)
|
|
863
874
|
return None
|
|
864
875
|
|
|
865
|
-
cat_features =
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
estimator is not None
|
|
869
|
-
and hasattr(estimator, "get_param")
|
|
870
|
-
and estimator.get_param("cat_features") is not None
|
|
871
|
-
):
|
|
872
|
-
cat_features = estimator.get_param("cat_features")
|
|
873
|
-
if len(cat_features) > 0 and isinstance(cat_features[0], int):
|
|
874
|
-
effectiveX = X or self.X
|
|
875
|
-
cat_features = [effectiveX.columns[i] for i in cat_features]
|
|
876
|
-
for cat_feature in cat_features:
|
|
877
|
-
if cat_feature in self.search_keys:
|
|
878
|
-
if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
879
|
-
search_keys_for_metrics.append(cat_feature)
|
|
880
|
-
else:
|
|
881
|
-
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
876
|
+
cat_features, search_keys_for_metrics = self._get_client_cat_features(
|
|
877
|
+
estimator, effective_X, self.search_keys
|
|
878
|
+
)
|
|
882
879
|
|
|
883
880
|
prepared_data = self._prepare_data_for_metrics(
|
|
884
881
|
trace_id=trace_id,
|
|
885
|
-
X=
|
|
886
|
-
y=
|
|
887
|
-
eval_set=
|
|
882
|
+
X=effective_X,
|
|
883
|
+
y=effective_y,
|
|
884
|
+
eval_set=effective_eval_set,
|
|
888
885
|
exclude_features_sources=exclude_features_sources,
|
|
889
886
|
importance_threshold=importance_threshold,
|
|
890
887
|
max_features=max_features,
|
|
@@ -893,6 +890,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
893
890
|
search_keys_for_metrics=search_keys_for_metrics,
|
|
894
891
|
progress_bar=progress_bar,
|
|
895
892
|
progress_callback=progress_callback,
|
|
893
|
+
cat_features=cat_features,
|
|
896
894
|
)
|
|
897
895
|
if prepared_data is None:
|
|
898
896
|
return None
|
|
@@ -994,8 +992,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
994
992
|
enriched_metric = None
|
|
995
993
|
uplift = None
|
|
996
994
|
|
|
997
|
-
effective_X = X if X is not None else self.X
|
|
998
|
-
effective_y = y if y is not None else self.y
|
|
999
995
|
train_metrics = {
|
|
1000
996
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
|
1001
997
|
"quality_metrics_train_segment"
|
|
@@ -1256,6 +1252,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1256
1252
|
).get_cv_and_groups(X)
|
|
1257
1253
|
else:
|
|
1258
1254
|
from sklearn import __version__ as sklearn_version
|
|
1255
|
+
|
|
1259
1256
|
try:
|
|
1260
1257
|
from sklearn.model_selection._split import GroupsConsumerMixin
|
|
1261
1258
|
|
|
@@ -1269,6 +1266,29 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1269
1266
|
|
|
1270
1267
|
return _cv, groups
|
|
1271
1268
|
|
|
1269
|
+
def _get_client_cat_features(
|
|
1270
|
+
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1271
|
+
) -> Optional[List[str]]:
|
|
1272
|
+
cat_features = None
|
|
1273
|
+
search_keys_for_metrics = []
|
|
1274
|
+
if (
|
|
1275
|
+
estimator is not None
|
|
1276
|
+
and hasattr(estimator, "get_param")
|
|
1277
|
+
and estimator.get_param("cat_features") is not None
|
|
1278
|
+
):
|
|
1279
|
+
cat_features = estimator.get_param("cat_features")
|
|
1280
|
+
if len(cat_features) > 0:
|
|
1281
|
+
if all([isinstance(f, int) for f in cat_features]):
|
|
1282
|
+
cat_features = [X.columns[i] for i in cat_features]
|
|
1283
|
+
self.logger.info(f"Collected categorical features {cat_features} from user estimator")
|
|
1284
|
+
for cat_feature in cat_features:
|
|
1285
|
+
if cat_feature in search_keys:
|
|
1286
|
+
if search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
1287
|
+
search_keys_for_metrics.append(cat_feature)
|
|
1288
|
+
else:
|
|
1289
|
+
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
1290
|
+
return cat_features, search_keys_for_metrics
|
|
1291
|
+
|
|
1272
1292
|
def _prepare_data_for_metrics(
|
|
1273
1293
|
self,
|
|
1274
1294
|
trace_id: str,
|
|
@@ -1283,6 +1303,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1283
1303
|
search_keys_for_metrics: Optional[List[str]] = None,
|
|
1284
1304
|
progress_bar: Optional[ProgressBar] = None,
|
|
1285
1305
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1306
|
+
cat_features: Optional[List[str]] = None,
|
|
1286
1307
|
):
|
|
1287
1308
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
|
1288
1309
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
|
@@ -1340,9 +1361,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1340
1361
|
|
|
1341
1362
|
# Detect and drop high cardinality columns in train
|
|
1342
1363
|
columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
]
|
|
1364
|
+
non_excluding_columns = (self.generate_features or []) + (cat_features or [])
|
|
1365
|
+
columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
|
|
1346
1366
|
if len(columns_with_high_cardinality) > 0:
|
|
1347
1367
|
self.logger.warning(
|
|
1348
1368
|
f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
|
|
@@ -1684,6 +1704,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1684
1704
|
df = validated_X.copy()
|
|
1685
1705
|
|
|
1686
1706
|
df[TARGET] = validated_y
|
|
1707
|
+
|
|
1708
|
+
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
1709
|
+
|
|
1687
1710
|
num_samples = _num_samples(df)
|
|
1688
1711
|
if num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
1689
1712
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
|
|
@@ -1801,10 +1824,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1801
1824
|
else:
|
|
1802
1825
|
features_section = ""
|
|
1803
1826
|
|
|
1804
|
-
|
|
1827
|
+
search_id = self._search_task.search_task_id
|
|
1828
|
+
api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
|
|
1805
1829
|
-H 'Authorization: {self.api_key}' \\
|
|
1806
1830
|
-H 'Content-Type: application/json' \\
|
|
1807
|
-
-d '{{"
|
|
1831
|
+
-d '{{"search_keys": {keys}{features_section}}}'"""
|
|
1808
1832
|
return api_example
|
|
1809
1833
|
|
|
1810
1834
|
def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
|
|
@@ -1899,6 +1923,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1899
1923
|
generated_features.extend(converter.generated_features)
|
|
1900
1924
|
else:
|
|
1901
1925
|
self.logger.info("Input dataset hasn't date column")
|
|
1926
|
+
if self.add_date_if_missing:
|
|
1927
|
+
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1902
1928
|
email_column = self._get_email_column(search_keys)
|
|
1903
1929
|
hem_column = self._get_hem_column(search_keys)
|
|
1904
1930
|
email_converted_to_hem = False
|
|
@@ -1918,6 +1944,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1918
1944
|
|
|
1919
1945
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1920
1946
|
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1947
|
+
|
|
1921
1948
|
if email_converted_to_hem:
|
|
1922
1949
|
non_keys_columns.append(email_column)
|
|
1923
1950
|
|
|
@@ -1939,6 +1966,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1939
1966
|
if add_fit_system_record_id:
|
|
1940
1967
|
df = self.__add_fit_system_record_id(df, dict(), search_keys)
|
|
1941
1968
|
df = df.rename(columns={SYSTEM_RECORD_ID: SORT_ID})
|
|
1969
|
+
non_keys_columns.append(SORT_ID)
|
|
1942
1970
|
|
|
1943
1971
|
columns_for_system_record_id = sorted(list(search_keys.keys()) + (original_features_for_transform or []))
|
|
1944
1972
|
|
|
@@ -2215,14 +2243,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2215
2243
|
self.fit_search_keys = self.search_keys.copy()
|
|
2216
2244
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2217
2245
|
|
|
2218
|
-
|
|
2246
|
+
validate_dates_distribution(validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
|
|
2247
|
+
|
|
2248
|
+
maybe_date_column = self._get_date_column(self.fit_search_keys)
|
|
2249
|
+
has_date = maybe_date_column is not None
|
|
2219
2250
|
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2220
2251
|
self._validate_binary_observations(validated_y, model_task_type)
|
|
2221
2252
|
|
|
2222
|
-
df = self.__handle_index_search_keys(df, self.fit_search_keys)
|
|
2223
|
-
|
|
2224
|
-
df = self.__correct_target(df)
|
|
2225
|
-
|
|
2226
2253
|
self.runtime_parameters = get_runtime_params_custom_loss(
|
|
2227
2254
|
self.loss, model_task_type, self.runtime_parameters, self.logger
|
|
2228
2255
|
)
|
|
@@ -2234,6 +2261,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2234
2261
|
eval_df[EVAL_SET_INDEX] = idx + 1
|
|
2235
2262
|
df = pd.concat([df, eval_df])
|
|
2236
2263
|
|
|
2264
|
+
df = self.__correct_target(df)
|
|
2265
|
+
|
|
2266
|
+
df = self.__handle_index_search_keys(df, self.fit_search_keys)
|
|
2267
|
+
|
|
2268
|
+
if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
|
|
2269
|
+
self._validate_PSI(df.sort_values(by=maybe_date_column))
|
|
2270
|
+
|
|
2237
2271
|
if DEFAULT_INDEX in df.columns:
|
|
2238
2272
|
msg = self.bundle.get("unsupported_index_column")
|
|
2239
2273
|
self.logger.info(msg)
|
|
@@ -2260,6 +2294,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2260
2294
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2261
2295
|
else:
|
|
2262
2296
|
self.logger.info("Input dataset hasn't date column")
|
|
2297
|
+
if self.add_date_if_missing:
|
|
2298
|
+
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2263
2299
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
2264
2300
|
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2265
2301
|
email_converted_to_hem = False
|
|
@@ -2512,7 +2548,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2512
2548
|
validated_X = X.copy()
|
|
2513
2549
|
elif isinstance(X, pd.Series):
|
|
2514
2550
|
validated_X = X.to_frame()
|
|
2515
|
-
elif isinstance(X, np.ndarray)
|
|
2551
|
+
elif isinstance(X, (list, np.ndarray)):
|
|
2516
2552
|
validated_X = pd.DataFrame(X)
|
|
2517
2553
|
renaming = {c: str(c) for c in validated_X.columns}
|
|
2518
2554
|
validated_X = validated_X.rename(columns=renaming)
|
|
@@ -2601,7 +2637,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2601
2637
|
validated_eval_X = eval_X.copy()
|
|
2602
2638
|
elif isinstance(eval_X, pd.Series):
|
|
2603
2639
|
validated_eval_X = eval_X.to_frame()
|
|
2604
|
-
elif isinstance(eval_X, np.ndarray)
|
|
2640
|
+
elif isinstance(eval_X, (list, np.ndarray)):
|
|
2605
2641
|
validated_eval_X = pd.DataFrame(eval_X)
|
|
2606
2642
|
renaming = {c: str(c) for c in validated_eval_X.columns}
|
|
2607
2643
|
validated_eval_X = validated_eval_X.rename(columns=renaming)
|
|
@@ -2783,7 +2819,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2783
2819
|
)
|
|
2784
2820
|
|
|
2785
2821
|
def sample(df):
|
|
2786
|
-
if isinstance(df, pd.
|
|
2822
|
+
if isinstance(df, (pd.DataFrame, pd.Series)):
|
|
2787
2823
|
return df.head(10)
|
|
2788
2824
|
else:
|
|
2789
2825
|
return df[:10]
|
|
@@ -2808,6 +2844,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2808
2844
|
|
|
2809
2845
|
maybe_date_col = self._get_date_column(self.search_keys)
|
|
2810
2846
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
2847
|
+
# TODO cast date column to single dtype
|
|
2811
2848
|
min_date = X[maybe_date_col].min()
|
|
2812
2849
|
max_date = X[maybe_date_col].max()
|
|
2813
2850
|
self.logger.info(f"Dates interval is ({min_date}, {max_date})")
|
|
@@ -2839,6 +2876,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2839
2876
|
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2840
2877
|
return col
|
|
2841
2878
|
|
|
2879
|
+
@staticmethod
|
|
2880
|
+
def _add_current_date_as_key(
|
|
2881
|
+
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
2882
|
+
) -> pd.DataFrame:
|
|
2883
|
+
if (
|
|
2884
|
+
set(search_keys.values()) == {SearchKey.PHONE}
|
|
2885
|
+
or set(search_keys.values()) == {SearchKey.EMAIL}
|
|
2886
|
+
or set(search_keys.values()) == {SearchKey.HEM}
|
|
2887
|
+
or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
|
|
2888
|
+
):
|
|
2889
|
+
msg = bundle.get("current_date_added")
|
|
2890
|
+
print(msg)
|
|
2891
|
+
logger.warning(msg)
|
|
2892
|
+
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
2893
|
+
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
2894
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
|
|
2895
|
+
df = converter.convert(df)
|
|
2896
|
+
return df
|
|
2897
|
+
|
|
2842
2898
|
@staticmethod
|
|
2843
2899
|
def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
|
|
2844
2900
|
return [
|
|
@@ -2877,26 +2933,33 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2877
2933
|
|
|
2878
2934
|
# order by date and idempotent order by other keys
|
|
2879
2935
|
if self.cv not in [CVType.time_series, CVType.blocked_time_series]:
|
|
2936
|
+
sort_exclude_columns = [original_order_name, ORIGINAL_INDEX, EVAL_SET_INDEX, TARGET, "__target"]
|
|
2880
2937
|
if DateTimeSearchKeyConverter.DATETIME_COL in df.columns:
|
|
2881
2938
|
date_column = DateTimeSearchKeyConverter.DATETIME_COL
|
|
2939
|
+
sort_exclude_columns.append(self._get_date_column(search_keys))
|
|
2882
2940
|
else:
|
|
2883
2941
|
date_column = self._get_date_column(search_keys)
|
|
2884
2942
|
sort_columns = [date_column] if date_column is not None else []
|
|
2885
2943
|
|
|
2886
|
-
|
|
2944
|
+
other_columns = sorted(
|
|
2887
2945
|
[
|
|
2888
|
-
|
|
2889
|
-
for
|
|
2890
|
-
if
|
|
2891
|
-
and sk in df.columns
|
|
2892
|
-
and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
2946
|
+
c
|
|
2947
|
+
for c in df.columns
|
|
2948
|
+
if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
|
|
2893
2949
|
]
|
|
2950
|
+
# [
|
|
2951
|
+
# sk
|
|
2952
|
+
# for sk, key_type in search_keys.items()
|
|
2953
|
+
# if key_type not in [SearchKey.DATE, SearchKey.DATETIME]
|
|
2954
|
+
# and sk in df.columns
|
|
2955
|
+
# and df[sk].nunique() > 1 # don't use constant keys for hash
|
|
2956
|
+
# ]
|
|
2894
2957
|
)
|
|
2895
2958
|
|
|
2896
2959
|
search_keys_hash = "search_keys_hash"
|
|
2897
|
-
if len(
|
|
2960
|
+
if len(other_columns) > 0:
|
|
2898
2961
|
sort_columns.append(search_keys_hash)
|
|
2899
|
-
df[search_keys_hash] = pd.util.hash_pandas_object(df[
|
|
2962
|
+
df[search_keys_hash] = pd.util.hash_pandas_object(df[other_columns], index=False)
|
|
2900
2963
|
|
|
2901
2964
|
df = df.sort_values(by=sort_columns)
|
|
2902
2965
|
|
|
@@ -2920,7 +2983,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2920
2983
|
|
|
2921
2984
|
def __correct_target(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
2922
2985
|
target = df[self.TARGET_NAME]
|
|
2923
|
-
if is_string_dtype(target):
|
|
2986
|
+
if is_string_dtype(target) or is_object_dtype(target):
|
|
2924
2987
|
maybe_numeric_target = pd.to_numeric(target, errors="coerce")
|
|
2925
2988
|
# If less than 5% is non numeric then leave this rows with NaN target and later it will be dropped
|
|
2926
2989
|
if maybe_numeric_target.isna().sum() <= _num_samples(df) * 0.05:
|
|
@@ -3185,22 +3248,23 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3185
3248
|
return None
|
|
3186
3249
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
3187
3250
|
|
|
3188
|
-
def
|
|
3251
|
+
def get_feature_by_name(name: str):
|
|
3189
3252
|
for m in features_meta:
|
|
3190
|
-
if m.name
|
|
3253
|
+
if m.name == name:
|
|
3191
3254
|
return m
|
|
3192
3255
|
|
|
3193
3256
|
descriptions = []
|
|
3194
3257
|
for m in autofe_meta:
|
|
3195
3258
|
autofe_feature = Feature.from_formula(m.formula)
|
|
3259
|
+
orig_to_hashed = {base_column.original_name: base_column.hashed_name for base_column in m.base_columns}
|
|
3260
|
+
autofe_feature.rename_columns(orig_to_hashed)
|
|
3261
|
+
autofe_feature.set_display_index(m.display_index)
|
|
3196
3262
|
if autofe_feature.op.is_vector:
|
|
3197
3263
|
continue
|
|
3198
3264
|
|
|
3199
3265
|
description = dict()
|
|
3200
3266
|
|
|
3201
|
-
feature_meta =
|
|
3202
|
-
m.display_index, autofe_feature.op.alias or autofe_feature.op.name
|
|
3203
|
-
)
|
|
3267
|
+
feature_meta = get_feature_by_name(autofe_feature.get_display_name(shorten=True))
|
|
3204
3268
|
if feature_meta is None:
|
|
3205
3269
|
self.logger.warning(f"Feature meta for display index {m.display_index} not found")
|
|
3206
3270
|
continue
|
|
@@ -3321,7 +3385,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3321
3385
|
valid_search_keys[column_name] = SearchKey.CUSTOM_KEY
|
|
3322
3386
|
else:
|
|
3323
3387
|
if x[column_name].isnull().all() or (
|
|
3324
|
-
is_string_dtype(x[column_name])
|
|
3388
|
+
(is_string_dtype(x[column_name]) or is_object_dtype(x[column_name]))
|
|
3389
|
+
and (x[column_name].astype("string").str.strip() == "").all()
|
|
3325
3390
|
):
|
|
3326
3391
|
raise ValidationError(self.bundle.get("empty_search_key").format(column_name))
|
|
3327
3392
|
|
|
@@ -3547,6 +3612,34 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3547
3612
|
self.logger.warning(msg)
|
|
3548
3613
|
print(msg)
|
|
3549
3614
|
|
|
3615
|
+
def _validate_PSI(self, df: pd.DataFrame):
|
|
3616
|
+
if EVAL_SET_INDEX in df.columns:
|
|
3617
|
+
train = df.query(f"{EVAL_SET_INDEX} == 0")
|
|
3618
|
+
eval1 = df.query(f"{EVAL_SET_INDEX} == 1")
|
|
3619
|
+
else:
|
|
3620
|
+
train = df
|
|
3621
|
+
eval1 = None
|
|
3622
|
+
|
|
3623
|
+
# 1. Check train PSI
|
|
3624
|
+
half_train = round(len(train) / 2)
|
|
3625
|
+
part1 = train[:half_train]
|
|
3626
|
+
part2 = train[half_train:]
|
|
3627
|
+
train_psi = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
|
|
3628
|
+
if train_psi > 0.2:
|
|
3629
|
+
self.warning_counter.increment()
|
|
3630
|
+
msg = self.bundle.get("train_unstable_target").format(train_psi)
|
|
3631
|
+
print(msg)
|
|
3632
|
+
self.logger.warning(msg)
|
|
3633
|
+
|
|
3634
|
+
# 2. Check train-test PSI
|
|
3635
|
+
if eval1 is not None:
|
|
3636
|
+
train_test_psi = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
|
|
3637
|
+
if train_test_psi > 0.2:
|
|
3638
|
+
self.warning_counter.increment()
|
|
3639
|
+
msg = self.bundle.get("eval_unstable_target").format(train_test_psi)
|
|
3640
|
+
print(msg)
|
|
3641
|
+
self.logger.warning(msg)
|
|
3642
|
+
|
|
3550
3643
|
def _dump_python_libs(self):
|
|
3551
3644
|
try:
|
|
3552
3645
|
from pip._internal.operations.freeze import freeze
|
|
@@ -3600,7 +3693,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3600
3693
|
def sample(inp, sample_index):
|
|
3601
3694
|
if _num_samples(inp) <= 1000:
|
|
3602
3695
|
return inp
|
|
3603
|
-
if isinstance(inp, pd.DataFrame
|
|
3696
|
+
if isinstance(inp, (pd.DataFrame, pd.Series)):
|
|
3604
3697
|
return inp.sample(n=1000, random_state=random_state)
|
|
3605
3698
|
if isinstance(inp, np.ndarray):
|
|
3606
3699
|
return inp[sample_index]
|
|
@@ -3613,7 +3706,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3613
3706
|
if y is not None:
|
|
3614
3707
|
with open(f"{tmp_dir}/y.pickle", "wb") as y_file:
|
|
3615
3708
|
pickle.dump(sample(y, xy_sample_index), y_file)
|
|
3616
|
-
if eval_set
|
|
3709
|
+
if eval_set:
|
|
3617
3710
|
eval_xy_sample_index = rnd.randint(0, _num_samples(eval_set[0][0]), size=1000)
|
|
3618
3711
|
with open(f"{tmp_dir}/eval_x.pickle", "wb") as eval_x_file:
|
|
3619
3712
|
pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
|
upgini/http.py
CHANGED
|
@@ -22,6 +22,7 @@ from pydantic import BaseModel
|
|
|
22
22
|
from pythonjsonlogger import jsonlogger
|
|
23
23
|
from requests.exceptions import RequestException
|
|
24
24
|
|
|
25
|
+
from upgini.__about__ import __version__
|
|
25
26
|
from upgini.errors import (
|
|
26
27
|
HttpError,
|
|
27
28
|
UnauthorizedError,
|
|
@@ -38,17 +39,17 @@ from upgini.metadata import (
|
|
|
38
39
|
from upgini.resource_bundle import bundle
|
|
39
40
|
from upgini.utils.track_info import get_track_metrics
|
|
40
41
|
|
|
41
|
-
try:
|
|
42
|
-
|
|
42
|
+
# try:
|
|
43
|
+
# from importlib.metadata import version # type: ignore
|
|
43
44
|
|
|
44
|
-
|
|
45
|
-
except ImportError:
|
|
46
|
-
|
|
47
|
-
|
|
45
|
+
# __version__ = version("upgini")
|
|
46
|
+
# except ImportError:
|
|
47
|
+
# try:
|
|
48
|
+
# from importlib_metadata import version # type: ignore
|
|
48
49
|
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
50
|
+
# __version__ = version("upgini")
|
|
51
|
+
# except ImportError:
|
|
52
|
+
# __version__ = "Upgini wasn't installed"
|
|
52
53
|
|
|
53
54
|
UPGINI_URL: str = "UPGINI_URL"
|
|
54
55
|
UPGINI_API_KEY: str = "UPGINI_API_KEY"
|
|
@@ -925,7 +926,7 @@ def is_demo_api_key(api_token: Optional[str]) -> bool:
|
|
|
925
926
|
return api_token is None or api_token == "" or api_token == DEMO_API_KEY
|
|
926
927
|
|
|
927
928
|
|
|
928
|
-
@lru_cache
|
|
929
|
+
@lru_cache
|
|
929
930
|
def _get_rest_client(
|
|
930
931
|
backend_url: str, api_token: str, client_ip: Optional[str] = None, client_visitorid: Optional[str] = None
|
|
931
932
|
) -> _RestClient:
|
upgini/mdc/__init__.py
CHANGED
|
@@ -1,15 +1,13 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
1
|
"""
|
|
3
2
|
.. module: mdc
|
|
4
3
|
.. moduleauthor:: Aljosha Friemann a.friemann@automate.wtf
|
|
5
4
|
"""
|
|
6
|
-
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
7
5
|
|
|
8
6
|
import logging
|
|
9
7
|
|
|
10
|
-
from upgini.mdc.context import new_log_context, get_mdc_fields
|
|
11
8
|
from pythonjsonlogger import jsonlogger
|
|
12
9
|
|
|
10
|
+
from upgini.mdc.context import get_mdc_fields, new_log_context
|
|
13
11
|
|
|
14
12
|
MDContext = new_log_context
|
|
15
13
|
MDC = new_log_context
|
upgini/mdc/context.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
# -*- coding: utf-8 -*-
|
|
2
1
|
"""
|
|
3
2
|
.. module: TODO
|
|
4
3
|
:platform: TODO
|
|
@@ -7,12 +6,11 @@
|
|
|
7
6
|
.. moduleauthor:: Aljosha Friemann a.friemann@automate.wtf
|
|
8
7
|
"""
|
|
9
8
|
|
|
10
|
-
import
|
|
11
|
-
import uuid
|
|
9
|
+
import collections
|
|
12
10
|
import logging
|
|
13
11
|
import threading
|
|
14
|
-
import
|
|
15
|
-
|
|
12
|
+
import time
|
|
13
|
+
import uuid
|
|
16
14
|
from contextlib import contextmanager
|
|
17
15
|
|
|
18
16
|
LOGGER = logging.getLogger(__name__)
|
|
@@ -32,7 +30,7 @@ def get_mdc_fields():
|
|
|
32
30
|
|
|
33
31
|
@contextmanager
|
|
34
32
|
def new_log_context(**kwargs):
|
|
35
|
-
context_id = "mdc-{
|
|
33
|
+
context_id = f"mdc-{threading.current_thread().ident}-{uuid.uuid4()}"
|
|
36
34
|
|
|
37
35
|
LOGGER.debug("creating context %s", context_id)
|
|
38
36
|
|
upgini/metadata.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
1
3
|
from enum import Enum
|
|
2
4
|
from typing import Dict, List, Optional, Set
|
|
3
5
|
|
|
@@ -201,6 +203,7 @@ class FileMetadata(BaseModel):
|
|
|
201
203
|
for c in self.columns:
|
|
202
204
|
if c.name == name:
|
|
203
205
|
return c
|
|
206
|
+
return None
|
|
204
207
|
|
|
205
208
|
def search_types(self) -> Set[SearchKey]:
|
|
206
209
|
search_keys = set()
|