upgini 1.1.274__tar.gz → 1.1.274a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.274/src/upgini.egg-info → upgini-1.1.274a1}/PKG-INFO +2 -2
- {upgini-1.1.274 → upgini-1.1.274a1}/setup.py +2 -2
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/date.py +2 -9
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/data_source/data_source_publisher.py +1 -1
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/features_enricher.py +27 -66
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/metrics.py +0 -12
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/normalizer/phone_normalizer.py +2 -2
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/resource_bundle/strings.properties +1 -2
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/datetime_utils.py +0 -3
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/track_info.py +13 -25
- {upgini-1.1.274 → upgini-1.1.274a1/src/upgini.egg-info}/PKG-INFO +2 -2
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini.egg-info/requires.txt +1 -1
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_autofe_operands.py +1 -2
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_features_enricher.py +13 -18
- {upgini-1.1.274 → upgini-1.1.274a1}/LICENSE +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/README.md +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/pyproject.toml +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/setup.cfg +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/__init__.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/ads.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/dataset.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/errors.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/http.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/metadata.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/search_task.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/spinner.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_country_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_email_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_metrics.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_target_utils.py +0 -0
- {upgini-1.1.274 → upgini-1.1.274a1}/tests/test_widget.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.274
|
|
3
|
+
Version: 1.1.274a1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
|
|
|
28
28
|
License-File: LICENSE
|
|
29
29
|
Requires-Dist: python-dateutil>=2.8.0
|
|
30
30
|
Requires-Dist: requests>=2.8.0
|
|
31
|
-
Requires-Dist: pandas<
|
|
31
|
+
Requires-Dist: pandas<2.1.0,>=1.1.0
|
|
32
32
|
Requires-Dist: numpy>=1.19.0
|
|
33
33
|
Requires-Dist: scikit-learn>=1.3.0
|
|
34
34
|
Requires-Dist: pydantic<2.0.0,>=1.8.2
|
|
@@ -40,7 +40,7 @@ def send_log(msg: str):
|
|
|
40
40
|
|
|
41
41
|
|
|
42
42
|
here = Path(__file__).parent.resolve()
|
|
43
|
-
version = "1.1.274"
|
|
43
|
+
version = "1.1.274a1"
|
|
44
44
|
try:
|
|
45
45
|
send_log(f"Start setup PyLib version {version}")
|
|
46
46
|
setup(
|
|
@@ -77,7 +77,7 @@ try:
|
|
|
77
77
|
install_requires=[
|
|
78
78
|
"python-dateutil>=2.8.0",
|
|
79
79
|
"requests>=2.8.0",
|
|
80
|
-
"pandas>=1.1.0,<
|
|
80
|
+
"pandas>=1.1.0,<2.1.0",
|
|
81
81
|
"numpy>=1.19.0",
|
|
82
82
|
"scikit-learn>=1.3.0",
|
|
83
83
|
"pydantic>=1.8.2,<2.0.0",
|
|
@@ -2,7 +2,6 @@ from typing import Any, Optional, Union
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
from pydantic import BaseModel
|
|
5
|
-
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
6
5
|
|
|
7
6
|
from upgini.autofe.operand import PandasOperand
|
|
8
7
|
|
|
@@ -47,7 +46,6 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
|
47
46
|
future = right + (left.dt.year - right.dt.year).apply(
|
|
48
47
|
lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
|
|
49
48
|
)
|
|
50
|
-
future = pd.to_datetime(future)
|
|
51
49
|
before = future[future < left]
|
|
52
50
|
future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
|
|
53
51
|
diff = (future - left) / np.timedelta64(1, self.diff_unit)
|
|
@@ -74,13 +72,8 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
74
72
|
|
|
75
73
|
return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
|
|
76
74
|
|
|
77
|
-
def _diff(self, x
|
|
78
|
-
|
|
79
|
-
x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
|
|
80
|
-
elif self.diff_unit == "M":
|
|
81
|
-
raise Exception("Unsupported difference unit: Month")
|
|
82
|
-
else:
|
|
83
|
-
x = x / np.timedelta64(1, self.diff_unit)
|
|
75
|
+
def _diff(self, x):
|
|
76
|
+
x = x / np.timedelta64(1, self.diff_unit)
|
|
84
77
|
return x[x > 0]
|
|
85
78
|
|
|
86
79
|
def _agg(self, x):
|
|
@@ -48,7 +48,6 @@ class DataSourcePublisher:
|
|
|
48
48
|
data_table_uri: str,
|
|
49
49
|
search_keys: Dict[str, SearchKey],
|
|
50
50
|
update_frequency: str,
|
|
51
|
-
exclude_from_autofe_generation: Optional[List[str]],
|
|
52
51
|
secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
|
|
53
52
|
sort_column: Optional[str] = None,
|
|
54
53
|
date_format: Optional[str] = None,
|
|
@@ -58,6 +57,7 @@ class DataSourcePublisher:
|
|
|
58
57
|
join_date_abs_limit_days: Optional[int] = None,
|
|
59
58
|
features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
|
|
60
59
|
data_table_id_to_replace: Optional[str] = None,
|
|
60
|
+
exclude_from_autofe_generation: Optional[List[str]] = None,
|
|
61
61
|
_force_generation=False,
|
|
62
62
|
_silent=False,
|
|
63
63
|
) -> str:
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import dataclasses
|
|
2
|
-
import datetime
|
|
3
2
|
import gc
|
|
4
3
|
import hashlib
|
|
5
4
|
import itertools
|
|
@@ -147,7 +146,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
147
146
|
"""
|
|
148
147
|
|
|
149
148
|
TARGET_NAME = "target"
|
|
150
|
-
CURRENT_DATE = "current_date"
|
|
151
149
|
RANDOM_STATE = 42
|
|
152
150
|
CALCULATE_METRICS_THRESHOLD = 50_000_000
|
|
153
151
|
CALCULATE_METRICS_MIN_THRESHOLD = 500
|
|
@@ -209,7 +207,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
209
207
|
client_ip: Optional[str] = None,
|
|
210
208
|
client_visitorid: Optional[str] = None,
|
|
211
209
|
custom_bundle_config: Optional[str] = None,
|
|
212
|
-
add_date_if_missing: bool = True,
|
|
213
210
|
**kwargs,
|
|
214
211
|
):
|
|
215
212
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -320,7 +317,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
320
317
|
self.raise_validation_error = raise_validation_error
|
|
321
318
|
self.exclude_columns = exclude_columns
|
|
322
319
|
self.baseline_score_column = baseline_score_column
|
|
323
|
-
self.add_date_if_missing = add_date_if_missing
|
|
324
320
|
|
|
325
321
|
def _get_api_key(self):
|
|
326
322
|
return self._api_key
|
|
@@ -424,9 +420,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
424
420
|
|
|
425
421
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
426
422
|
|
|
427
|
-
# Validate client estimator params
|
|
428
|
-
self._get_client_cat_features(estimator, X, self.search_keys)
|
|
429
|
-
|
|
430
423
|
try:
|
|
431
424
|
self.X = X
|
|
432
425
|
self.y = y
|
|
@@ -820,7 +813,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
820
813
|
trace_id = trace_id or str(uuid.uuid4())
|
|
821
814
|
start_time = time.time()
|
|
822
815
|
with MDC(trace_id=trace_id):
|
|
823
|
-
self.logger.info("Start calculate metrics")
|
|
824
816
|
if len(args) > 0:
|
|
825
817
|
msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
|
|
826
818
|
self.logger.warning(msg)
|
|
@@ -872,9 +864,22 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
872
864
|
self.__display_support_link(msg)
|
|
873
865
|
return None
|
|
874
866
|
|
|
875
|
-
cat_features
|
|
876
|
-
|
|
877
|
-
|
|
867
|
+
cat_features = None
|
|
868
|
+
search_keys_for_metrics = []
|
|
869
|
+
if (
|
|
870
|
+
estimator is not None
|
|
871
|
+
and hasattr(estimator, "get_param")
|
|
872
|
+
and estimator.get_param("cat_features") is not None
|
|
873
|
+
):
|
|
874
|
+
cat_features = estimator.get_param("cat_features")
|
|
875
|
+
if len(cat_features) > 0 and isinstance(cat_features[0], int):
|
|
876
|
+
cat_features = [effective_X.columns[i] for i in cat_features]
|
|
877
|
+
for cat_feature in cat_features:
|
|
878
|
+
if cat_feature in self.search_keys:
|
|
879
|
+
if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
880
|
+
search_keys_for_metrics.append(cat_feature)
|
|
881
|
+
else:
|
|
882
|
+
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
878
883
|
|
|
879
884
|
prepared_data = self._prepare_data_for_metrics(
|
|
880
885
|
trace_id=trace_id,
|
|
@@ -889,7 +894,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
889
894
|
search_keys_for_metrics=search_keys_for_metrics,
|
|
890
895
|
progress_bar=progress_bar,
|
|
891
896
|
progress_callback=progress_callback,
|
|
892
|
-
cat_features=cat_features,
|
|
893
897
|
)
|
|
894
898
|
if prepared_data is None:
|
|
895
899
|
return None
|
|
@@ -1265,29 +1269,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1265
1269
|
|
|
1266
1270
|
return _cv, groups
|
|
1267
1271
|
|
|
1268
|
-
def _get_client_cat_features(
|
|
1269
|
-
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1270
|
-
) -> Optional[List[str]]:
|
|
1271
|
-
cat_features = None
|
|
1272
|
-
search_keys_for_metrics = []
|
|
1273
|
-
if (
|
|
1274
|
-
estimator is not None
|
|
1275
|
-
and hasattr(estimator, "get_param")
|
|
1276
|
-
and estimator.get_param("cat_features") is not None
|
|
1277
|
-
):
|
|
1278
|
-
cat_features = estimator.get_param("cat_features")
|
|
1279
|
-
if len(cat_features) > 0:
|
|
1280
|
-
if all([isinstance(f, int) for f in cat_features]):
|
|
1281
|
-
cat_features = [X.columns[i] for i in cat_features]
|
|
1282
|
-
self.logger.info(f"Collected categorical features {cat_features} from user estimator")
|
|
1283
|
-
for cat_feature in cat_features:
|
|
1284
|
-
if cat_feature in search_keys:
|
|
1285
|
-
if search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
1286
|
-
search_keys_for_metrics.append(cat_feature)
|
|
1287
|
-
else:
|
|
1288
|
-
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
1289
|
-
return cat_features, search_keys_for_metrics
|
|
1290
|
-
|
|
1291
1272
|
def _prepare_data_for_metrics(
|
|
1292
1273
|
self,
|
|
1293
1274
|
trace_id: str,
|
|
@@ -1302,7 +1283,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1302
1283
|
search_keys_for_metrics: Optional[List[str]] = None,
|
|
1303
1284
|
progress_bar: Optional[ProgressBar] = None,
|
|
1304
1285
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1305
|
-
cat_features: Optional[List[str]] = None,
|
|
1306
1286
|
):
|
|
1307
1287
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
|
1308
1288
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
|
@@ -1360,8 +1340,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1360
1340
|
|
|
1361
1341
|
# Detect and drop high cardinality columns in train
|
|
1362
1342
|
columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
|
|
1363
|
-
|
|
1364
|
-
|
|
1343
|
+
columns_with_high_cardinality = [
|
|
1344
|
+
c for c in columns_with_high_cardinality if c not in (self.generate_features or [])
|
|
1345
|
+
]
|
|
1365
1346
|
if len(columns_with_high_cardinality) > 0:
|
|
1366
1347
|
self.logger.warning(
|
|
1367
1348
|
f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
|
|
@@ -1823,11 +1804,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1823
1804
|
else:
|
|
1824
1805
|
features_section = ""
|
|
1825
1806
|
|
|
1826
|
-
|
|
1827
|
-
api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
|
|
1807
|
+
api_example = f"""curl 'https://inference-upgini.azurewebsites.net/api/http_inference_trigger' \\
|
|
1828
1808
|
-H 'Authorization: {self.api_key}' \\
|
|
1829
1809
|
-H 'Content-Type: application/json' \\
|
|
1830
|
-
-d '{{"search_keys": {keys}{features_section}}}'"""
|
|
1810
|
+
-d '{{"search_id": "{self._search_task.search_task_id}", "search_keys": {keys}{features_section}}}'"""
|
|
1831
1811
|
return api_example
|
|
1832
1812
|
|
|
1833
1813
|
def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
|
|
@@ -1922,8 +1902,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1922
1902
|
generated_features.extend(converter.generated_features)
|
|
1923
1903
|
else:
|
|
1924
1904
|
self.logger.info("Input dataset hasn't date column")
|
|
1925
|
-
if self.add_date_if_missing:
|
|
1926
|
-
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1927
1905
|
email_column = self._get_email_column(search_keys)
|
|
1928
1906
|
hem_column = self._get_hem_column(search_keys)
|
|
1929
1907
|
email_converted_to_hem = False
|
|
@@ -2242,7 +2220,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2242
2220
|
self.fit_search_keys = self.search_keys.copy()
|
|
2243
2221
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2244
2222
|
|
|
2245
|
-
validate_dates_distribution(
|
|
2223
|
+
validate_dates_distribution(
|
|
2224
|
+
validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
|
|
2225
|
+
)
|
|
2246
2226
|
|
|
2247
2227
|
maybe_date_column = self._get_date_column(self.fit_search_keys)
|
|
2248
2228
|
has_date = maybe_date_column is not None
|
|
@@ -2293,8 +2273,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2293
2273
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2294
2274
|
else:
|
|
2295
2275
|
self.logger.info("Input dataset hasn't date column")
|
|
2296
|
-
if self.add_date_if_missing:
|
|
2297
|
-
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2298
2276
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
2299
2277
|
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2300
2278
|
email_converted_to_hem = False
|
|
@@ -2875,25 +2853,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2875
2853
|
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2876
2854
|
return col
|
|
2877
2855
|
|
|
2878
|
-
@staticmethod
|
|
2879
|
-
def _add_current_date_as_key(
|
|
2880
|
-
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
2881
|
-
) -> pd.DataFrame:
|
|
2882
|
-
if (
|
|
2883
|
-
set(search_keys.values()) == {SearchKey.PHONE}
|
|
2884
|
-
or set(search_keys.values()) == {SearchKey.EMAIL}
|
|
2885
|
-
or set(search_keys.values()) == {SearchKey.HEM}
|
|
2886
|
-
or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
|
|
2887
|
-
):
|
|
2888
|
-
msg = bundle.get("current_date_added")
|
|
2889
|
-
print(msg)
|
|
2890
|
-
logger.warning(msg)
|
|
2891
|
-
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
2892
|
-
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
2893
|
-
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
|
|
2894
|
-
df = converter.convert(df)
|
|
2895
|
-
return df
|
|
2896
|
-
|
|
2897
2856
|
@staticmethod
|
|
2898
2857
|
def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
|
|
2899
2858
|
return [
|
|
@@ -2944,7 +2903,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2944
2903
|
[
|
|
2945
2904
|
c
|
|
2946
2905
|
for c in df.columns
|
|
2947
|
-
if c not in sort_columns
|
|
2906
|
+
if c not in sort_columns
|
|
2907
|
+
and c not in sort_exclude_columns
|
|
2908
|
+
and df[c].nunique() > 1
|
|
2948
2909
|
]
|
|
2949
2910
|
# [
|
|
2950
2911
|
# sk
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import inspect
|
|
2
1
|
import logging
|
|
3
2
|
import re
|
|
4
3
|
from copy import deepcopy
|
|
@@ -382,11 +381,6 @@ class EstimatorWrapper:
|
|
|
382
381
|
kwargs["estimator"] = estimator_copy
|
|
383
382
|
if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
|
|
384
383
|
if cat_features is not None:
|
|
385
|
-
for cat_feature in cat_features:
|
|
386
|
-
if cat_feature not in X.columns:
|
|
387
|
-
logger.error(
|
|
388
|
-
f"Client cat_feature `{cat_feature}` not found in X columns: {X.columns.to_list()}"
|
|
389
|
-
)
|
|
390
384
|
estimator_copy.set_params(
|
|
391
385
|
cat_features=[X.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
392
386
|
)
|
|
@@ -653,12 +647,6 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
653
647
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
654
648
|
if isinstance(scoring, str) and scoring is not None:
|
|
655
649
|
_get_scorer_by_name(scoring)
|
|
656
|
-
elif isinstance(scoring, Callable):
|
|
657
|
-
spec = inspect.getfullargspec(scoring)
|
|
658
|
-
if len(spec.args) < 3:
|
|
659
|
-
raise ValidationError(
|
|
660
|
-
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, X, y"
|
|
661
|
-
)
|
|
662
650
|
|
|
663
651
|
|
|
664
652
|
def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
|
|
4
|
+
from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
|
|
5
5
|
|
|
6
6
|
from upgini.errors import ValidationError
|
|
7
7
|
|
|
@@ -44,7 +44,7 @@ class PhoneNormalizer:
|
|
|
44
44
|
Method will remove all non numeric chars from string and convert it to int.
|
|
45
45
|
None will be set for phone numbers that couldn"t be converted to int
|
|
46
46
|
"""
|
|
47
|
-
if is_string_dtype(self.df[self.phone_column_name])
|
|
47
|
+
if is_string_dtype(self.df[self.phone_column_name]):
|
|
48
48
|
convert_func = self.phone_str_to_int_safe
|
|
49
49
|
elif is_float_dtype(self.df[self.phone_column_name]):
|
|
50
50
|
convert_func = self.phone_float_to_int_safe
|
|
@@ -38,7 +38,6 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
|
|
|
38
38
|
loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
|
|
39
39
|
multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
40
40
|
group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
|
|
41
|
-
current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
42
41
|
|
|
43
42
|
# Errors
|
|
44
43
|
failed_search_by_task_id=Failed to retrieve the specified search results
|
|
@@ -159,7 +158,7 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
|
|
|
159
158
|
dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
|
|
160
159
|
dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
|
|
161
160
|
dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
|
|
162
|
-
dataset_rarest_class_less_min=
|
|
161
|
+
dataset_rarest_class_less_min=Frequency of the rarest class `{}` is {}, minimum frequency must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
|
|
163
162
|
dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
|
|
164
163
|
dataset_date_features=\nWARNING: Columns {} is a datetime or period type but not used as a search key, removed from X
|
|
165
164
|
dataset_too_many_features=Too many features. Maximum number of features is {}
|
|
@@ -100,9 +100,6 @@ class DateTimeSearchKeyConverter:
|
|
|
100
100
|
msg = self.bundle.get("unsupported_date_type").format(self.date_column)
|
|
101
101
|
self.logger.warning(msg)
|
|
102
102
|
raise ValidationError(msg)
|
|
103
|
-
else:
|
|
104
|
-
df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
|
|
105
|
-
df[self.date_column] = self.parse_date(df)
|
|
106
103
|
|
|
107
104
|
# If column with date is datetime then extract seconds of the day and minute of the hour
|
|
108
105
|
# as additional features
|
|
@@ -55,7 +55,7 @@ def _get_execution_ide() -> str:
|
|
|
55
55
|
def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
|
|
56
56
|
# default values
|
|
57
57
|
track = {"ide": _get_execution_ide()}
|
|
58
|
-
ident_res = "https://
|
|
58
|
+
ident_res = "https://api.ipify.org"
|
|
59
59
|
|
|
60
60
|
try:
|
|
61
61
|
track["hostname"] = socket.gethostname()
|
|
@@ -74,20 +74,17 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
74
74
|
display(
|
|
75
75
|
Javascript(
|
|
76
76
|
"""
|
|
77
|
-
|
|
78
|
-
return import('https://upgini.github.io/upgini/js/a.js')
|
|
77
|
+
import('https://upgini.github.io/upgini/js/a.js')
|
|
79
78
|
.then(FingerprintJS => FingerprintJS.load())
|
|
80
79
|
.then(fp => fp.get())
|
|
81
|
-
.then(result => result.visitorId);
|
|
82
|
-
}
|
|
80
|
+
.then(result => window.visitorId = result.visitorId);
|
|
83
81
|
"""
|
|
84
82
|
)
|
|
85
83
|
)
|
|
86
|
-
track["visitorId"] = output.eval_js("
|
|
84
|
+
track["visitorId"] = output.eval_js("window.visitorId", timeout_sec=10)
|
|
87
85
|
except Exception as e:
|
|
88
86
|
track["err"] = str(e)
|
|
89
|
-
|
|
90
|
-
track["visitorId"] = "None"
|
|
87
|
+
track["visitorId"] = "None"
|
|
91
88
|
if client_ip:
|
|
92
89
|
track["ip"] = client_ip
|
|
93
90
|
else:
|
|
@@ -98,19 +95,16 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
98
95
|
display(
|
|
99
96
|
Javascript(
|
|
100
97
|
f"""
|
|
101
|
-
|
|
102
|
-
return fetch("{ident_res}")
|
|
98
|
+
fetch("{ident_res}")
|
|
103
99
|
.then(response => response.text())
|
|
104
|
-
.then(data => data);
|
|
105
|
-
}}
|
|
100
|
+
.then(data => window.clientIP = data);
|
|
106
101
|
"""
|
|
107
102
|
)
|
|
108
103
|
)
|
|
109
|
-
track["ip"] = output.eval_js("
|
|
104
|
+
track["ip"] = output.eval_js("window.clientIP", timeout_sec=10)
|
|
110
105
|
except Exception as e:
|
|
111
106
|
track["err"] = str(e)
|
|
112
|
-
|
|
113
|
-
track["ip"] = "0.0.0.0"
|
|
107
|
+
track["ip"] = "0.0.0.0"
|
|
114
108
|
|
|
115
109
|
elif track["ide"] == "binder":
|
|
116
110
|
try:
|
|
@@ -122,10 +116,8 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
122
116
|
track["visitorId"] = sha256(os.environ["CLIENT_IP"].encode()).hexdigest()
|
|
123
117
|
except Exception as e:
|
|
124
118
|
track["err"] = str(e)
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
if "visitorId" not in track:
|
|
128
|
-
track["visitorId"] = "None"
|
|
119
|
+
track["ip"] = "0.0.0.0"
|
|
120
|
+
track["visitorId"] = "None"
|
|
129
121
|
|
|
130
122
|
elif track["ide"] == "kaggle":
|
|
131
123
|
try:
|
|
@@ -144,8 +136,8 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
144
136
|
raise Exception(err)
|
|
145
137
|
except Exception as e:
|
|
146
138
|
track["err"] = str(e)
|
|
147
|
-
|
|
148
|
-
|
|
139
|
+
track["ip"] = "0.0.0.0"
|
|
140
|
+
track["visitorId"] = "None"
|
|
149
141
|
else:
|
|
150
142
|
try:
|
|
151
143
|
if client_ip:
|
|
@@ -158,9 +150,5 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
158
150
|
track["visitorId"] = sha256(str(getnode()).encode()).hexdigest()
|
|
159
151
|
except Exception as e:
|
|
160
152
|
track["err"] = str(e)
|
|
161
|
-
if "visitorId" not in track:
|
|
162
|
-
track["visitorId"] = "None"
|
|
163
|
-
if "ip" not in track:
|
|
164
|
-
track["ip"] = "0.0.0.0"
|
|
165
153
|
|
|
166
154
|
return track
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.274
|
|
3
|
+
Version: 1.1.274a1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -28,7 +28,7 @@ Description-Content-Type: text/markdown
|
|
|
28
28
|
License-File: LICENSE
|
|
29
29
|
Requires-Dist: python-dateutil>=2.8.0
|
|
30
30
|
Requires-Dist: requests>=2.8.0
|
|
31
|
-
Requires-Dist: pandas<
|
|
31
|
+
Requires-Dist: pandas<2.1.0,>=1.1.0
|
|
32
32
|
Requires-Dist: numpy>=1.19.0
|
|
33
33
|
Requires-Dist: scikit-learn>=1.3.0
|
|
34
34
|
Requires-Dist: pydantic<2.0.0,>=1.8.2
|
|
@@ -30,8 +30,7 @@ def test_date_diff_type2():
|
|
|
30
30
|
|
|
31
31
|
operand = DateDiffType2(left_unit="s")
|
|
32
32
|
expected_result = pd.Series([61.0, 182.0])
|
|
33
|
-
|
|
34
|
-
assert_series_equal(actual, expected_result)
|
|
33
|
+
assert_series_equal(operand.calculate_binary(df.date1, df.date2), expected_result)
|
|
35
34
|
|
|
36
35
|
|
|
37
36
|
def test_date_diff_list():
|
|
@@ -246,7 +246,7 @@ def test_eval_set_with_diff_order_of_columns(requests_mock: Mocker):
|
|
|
246
246
|
eval1_df = df[10000:11000].reset_index(drop=True)
|
|
247
247
|
eval1_features = eval1_df.drop(columns="target")
|
|
248
248
|
# shuffle columns
|
|
249
|
-
eval1_features = eval1_features[
|
|
249
|
+
eval1_features = eval1_features[set(eval1_features.columns)]
|
|
250
250
|
eval1_target = eval1_df["target"].reset_index(drop=True)
|
|
251
251
|
|
|
252
252
|
eval2_df = df[11000:12000]
|
|
@@ -375,7 +375,7 @@ def test_saved_features_enricher(requests_mock: Mocker):
|
|
|
375
375
|
url = "http://fake_url2"
|
|
376
376
|
|
|
377
377
|
path_to_mock_features = os.path.join(
|
|
378
|
-
os.path.dirname(os.path.realpath(__file__)), "test_data/binary/
|
|
378
|
+
os.path.dirname(os.path.realpath(__file__)), "test_data/binary/validation_features.parquet"
|
|
379
379
|
)
|
|
380
380
|
|
|
381
381
|
mock_default_requests(requests_mock, url)
|
|
@@ -462,7 +462,7 @@ def test_saved_features_enricher(requests_mock: Mocker):
|
|
|
462
462
|
segment_header: [train_segment, eval_1_segment, eval_2_segment],
|
|
463
463
|
rows_header: [10000, 1000, 1000],
|
|
464
464
|
target_mean_header: [0.5044, 0.487, 0.486],
|
|
465
|
-
enriched_gini: [0.
|
|
465
|
+
enriched_gini: [-0.000136, 0.000000, -0.003728],
|
|
466
466
|
}
|
|
467
467
|
)
|
|
468
468
|
print("Expected metrics: ")
|
|
@@ -487,13 +487,16 @@ def test_saved_features_enricher(requests_mock: Mocker):
|
|
|
487
487
|
train_random_indices = random.choice(train_target.index, size=9000, replace=False)
|
|
488
488
|
train_target.loc[train_random_indices] = 0
|
|
489
489
|
|
|
490
|
-
metrics = enricher.calculate_metrics(
|
|
490
|
+
metrics = enricher.calculate_metrics(
|
|
491
|
+
train_features,
|
|
492
|
+
train_target
|
|
493
|
+
)
|
|
491
494
|
expected_metrics = pd.DataFrame(
|
|
492
495
|
{
|
|
493
496
|
segment_header: [train_segment],
|
|
494
497
|
rows_header: [10000],
|
|
495
498
|
target_mean_header: [0.049],
|
|
496
|
-
enriched_gini: [0.
|
|
499
|
+
enriched_gini: [0.000985],
|
|
497
500
|
}
|
|
498
501
|
)
|
|
499
502
|
print("Expected metrics: ")
|
|
@@ -2227,9 +2230,8 @@ def test_email_search_key(requests_mock: Mocker):
|
|
|
2227
2230
|
"hashed_email_64ff8c",
|
|
2228
2231
|
"email_one_domain_3b0a68",
|
|
2229
2232
|
"email_domain_10c73f",
|
|
2230
|
-
"current_date_b993c4",
|
|
2231
2233
|
}
|
|
2232
|
-
assert {"hashed_email_64ff8c", "email_one_domain_3b0a68"
|
|
2234
|
+
assert {"hashed_email_64ff8c", "email_one_domain_3b0a68"} == {
|
|
2233
2235
|
sk for sublist in self.search_keys for sk in sublist
|
|
2234
2236
|
}
|
|
2235
2237
|
raise TestException()
|
|
@@ -2274,18 +2276,10 @@ def test_composit_index_search_key(requests_mock: Mocker):
|
|
|
2274
2276
|
**kwargs,
|
|
2275
2277
|
):
|
|
2276
2278
|
self.validate()
|
|
2277
|
-
assert set(self.columns.to_list()) == {
|
|
2278
|
-
"system_record_id",
|
|
2279
|
-
"country_aff64e",
|
|
2280
|
-
"postal_code_13534a",
|
|
2281
|
-
"current_date_b993c4",
|
|
2282
|
-
"target",
|
|
2283
|
-
}
|
|
2279
|
+
assert set(self.columns.to_list()) == {"system_record_id", "country_aff64e", "postal_code_13534a", "target"}
|
|
2284
2280
|
assert "country_aff64e" in self.columns
|
|
2285
2281
|
assert "postal_code_13534a"
|
|
2286
|
-
assert {"country_aff64e", "postal_code_13534a"
|
|
2287
|
-
sk for sublist in self.search_keys for sk in sublist
|
|
2288
|
-
}
|
|
2282
|
+
assert {"country_aff64e", "postal_code_13534a"} == {sk for sublist in self.search_keys for sk in sublist}
|
|
2289
2283
|
# assert "country_fake_a" in self.columns
|
|
2290
2284
|
# assert "postal_code_fake_a" in self.columns
|
|
2291
2285
|
# assert {"country_fake_a", "postal_code_fake_a"} == {sk for sublist in self.search_keys for sk in sublist}
|
|
@@ -2658,4 +2652,5 @@ class DataFrameWrapper:
|
|
|
2658
2652
|
|
|
2659
2653
|
|
|
2660
2654
|
class TestException(Exception):
|
|
2661
|
-
|
|
2655
|
+
def __init__(self):
|
|
2656
|
+
super().__init__()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|