upgini 1.1.273__tar.gz → 1.1.274__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.1.273/src/upgini.egg-info → upgini-1.1.274}/PKG-INFO +14 -1
- {upgini-1.1.273 → upgini-1.1.274}/setup.py +2 -2
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/autofe/date.py +9 -2
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/data_source/data_source_publisher.py +1 -1
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/features_enricher.py +66 -27
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/metrics.py +12 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/normalizer/phone_normalizer.py +2 -2
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/resource_bundle/strings.properties +2 -1
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/datetime_utils.py +3 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/track_info.py +25 -13
- {upgini-1.1.273 → upgini-1.1.274/src/upgini.egg-info}/PKG-INFO +14 -1
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini.egg-info/requires.txt +1 -1
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_autofe_operands.py +2 -1
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_features_enricher.py +18 -13
- {upgini-1.1.273 → upgini-1.1.274}/LICENSE +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/README.md +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/pyproject.toml +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/setup.cfg +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/__init__.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/ads.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/dataset.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/errors.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/http.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/metadata.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/search_task.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/spinner.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_country_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_email_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_metrics.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_target_utils.py +0 -0
- {upgini-1.1.273 → upgini-1.1.274}/tests/test_widget.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.273
|
|
3
|
+
Version: 1.1.274
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -26,6 +26,19 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
26
26
|
Requires-Python: >=3.8,<3.11
|
|
27
27
|
Description-Content-Type: text/markdown
|
|
28
28
|
License-File: LICENSE
|
|
29
|
+
Requires-Dist: python-dateutil>=2.8.0
|
|
30
|
+
Requires-Dist: requests>=2.8.0
|
|
31
|
+
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
32
|
+
Requires-Dist: numpy>=1.19.0
|
|
33
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
34
|
+
Requires-Dist: pydantic<2.0.0,>=1.8.2
|
|
35
|
+
Requires-Dist: fastparquet>=0.8.1
|
|
36
|
+
Requires-Dist: python-json-logger>=2.0.2
|
|
37
|
+
Requires-Dist: catboost>=1.0.3
|
|
38
|
+
Requires-Dist: lightgbm>=3.3.2
|
|
39
|
+
Requires-Dist: pyjwt>=2.8.0
|
|
40
|
+
Requires-Dist: xhtml2pdf==0.2.11
|
|
41
|
+
Requires-Dist: ipywidgets>=8.1.0
|
|
29
42
|
|
|
30
43
|
|
|
31
44
|
<!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : low-code feature search and enrichment library for machine learning </h2> -->
|
|
@@ -40,7 +40,7 @@ def send_log(msg: str):
|
|
|
40
40
|
|
|
41
41
|
|
|
42
42
|
here = Path(__file__).parent.resolve()
|
|
43
|
-
version = "1.1.273"
|
|
43
|
+
version = "1.1.274"
|
|
44
44
|
try:
|
|
45
45
|
send_log(f"Start setup PyLib version {version}")
|
|
46
46
|
setup(
|
|
@@ -77,7 +77,7 @@ try:
|
|
|
77
77
|
install_requires=[
|
|
78
78
|
"python-dateutil>=2.8.0",
|
|
79
79
|
"requests>=2.8.0",
|
|
80
|
-
"pandas>=1.1.0,<
|
|
80
|
+
"pandas>=1.1.0,<3.0.0",
|
|
81
81
|
"numpy>=1.19.0",
|
|
82
82
|
"scikit-learn>=1.3.0",
|
|
83
83
|
"pydantic>=1.8.2,<2.0.0",
|
|
@@ -2,6 +2,7 @@ from typing import Any, Optional, Union
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
from pydantic import BaseModel
|
|
5
|
+
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
5
6
|
|
|
6
7
|
from upgini.autofe.operand import PandasOperand
|
|
7
8
|
|
|
@@ -46,6 +47,7 @@ class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
|
46
47
|
future = right + (left.dt.year - right.dt.year).apply(
|
|
47
48
|
lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
|
|
48
49
|
)
|
|
50
|
+
future = pd.to_datetime(future)
|
|
49
51
|
before = future[future < left]
|
|
50
52
|
future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
|
|
51
53
|
diff = (future - left) / np.timedelta64(1, self.diff_unit)
|
|
@@ -72,8 +74,13 @@ class DateListDiff(PandasOperand, DateDiffMixin):
|
|
|
72
74
|
|
|
73
75
|
return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
|
|
74
76
|
|
|
75
|
-
def _diff(self, x):
|
|
76
|
-
|
|
77
|
+
def _diff(self, x: TimedeltaArray):
|
|
78
|
+
if self.diff_unit == "Y":
|
|
79
|
+
x = (x / 365 / 24 / 60 / 60 / 10**9).astype(int)
|
|
80
|
+
elif self.diff_unit == "M":
|
|
81
|
+
raise Exception("Unsupported difference unit: Month")
|
|
82
|
+
else:
|
|
83
|
+
x = x / np.timedelta64(1, self.diff_unit)
|
|
77
84
|
return x[x > 0]
|
|
78
85
|
|
|
79
86
|
def _agg(self, x):
|
|
@@ -48,6 +48,7 @@ class DataSourcePublisher:
|
|
|
48
48
|
data_table_uri: str,
|
|
49
49
|
search_keys: Dict[str, SearchKey],
|
|
50
50
|
update_frequency: str,
|
|
51
|
+
exclude_from_autofe_generation: Optional[List[str]],
|
|
51
52
|
secondary_search_keys: Optional[Dict[str, SearchKey]] = None,
|
|
52
53
|
sort_column: Optional[str] = None,
|
|
53
54
|
date_format: Optional[str] = None,
|
|
@@ -57,7 +58,6 @@ class DataSourcePublisher:
|
|
|
57
58
|
join_date_abs_limit_days: Optional[int] = None,
|
|
58
59
|
features_for_embeddings: Optional[List[str]] = DEFAULT_GENERATE_EMBEDDINGS,
|
|
59
60
|
data_table_id_to_replace: Optional[str] = None,
|
|
60
|
-
exclude_from_autofe_generation: Optional[List[str]] = None,
|
|
61
61
|
_force_generation=False,
|
|
62
62
|
_silent=False,
|
|
63
63
|
) -> str:
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import dataclasses
|
|
2
|
+
import datetime
|
|
2
3
|
import gc
|
|
3
4
|
import hashlib
|
|
4
5
|
import itertools
|
|
@@ -146,6 +147,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
146
147
|
"""
|
|
147
148
|
|
|
148
149
|
TARGET_NAME = "target"
|
|
150
|
+
CURRENT_DATE = "current_date"
|
|
149
151
|
RANDOM_STATE = 42
|
|
150
152
|
CALCULATE_METRICS_THRESHOLD = 50_000_000
|
|
151
153
|
CALCULATE_METRICS_MIN_THRESHOLD = 500
|
|
@@ -207,6 +209,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
207
209
|
client_ip: Optional[str] = None,
|
|
208
210
|
client_visitorid: Optional[str] = None,
|
|
209
211
|
custom_bundle_config: Optional[str] = None,
|
|
212
|
+
add_date_if_missing: bool = True,
|
|
210
213
|
**kwargs,
|
|
211
214
|
):
|
|
212
215
|
self.bundle = get_custom_bundle(custom_bundle_config)
|
|
@@ -317,6 +320,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
317
320
|
self.raise_validation_error = raise_validation_error
|
|
318
321
|
self.exclude_columns = exclude_columns
|
|
319
322
|
self.baseline_score_column = baseline_score_column
|
|
323
|
+
self.add_date_if_missing = add_date_if_missing
|
|
320
324
|
|
|
321
325
|
def _get_api_key(self):
|
|
322
326
|
return self._api_key
|
|
@@ -420,6 +424,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
420
424
|
|
|
421
425
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
422
426
|
|
|
427
|
+
# Validate client estimator params
|
|
428
|
+
self._get_client_cat_features(estimator, X, self.search_keys)
|
|
429
|
+
|
|
423
430
|
try:
|
|
424
431
|
self.X = X
|
|
425
432
|
self.y = y
|
|
@@ -813,6 +820,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
813
820
|
trace_id = trace_id or str(uuid.uuid4())
|
|
814
821
|
start_time = time.time()
|
|
815
822
|
with MDC(trace_id=trace_id):
|
|
823
|
+
self.logger.info("Start calculate metrics")
|
|
816
824
|
if len(args) > 0:
|
|
817
825
|
msg = f"WARNING: Unsupported positional arguments for calculate_metrics: {args}"
|
|
818
826
|
self.logger.warning(msg)
|
|
@@ -864,22 +872,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
864
872
|
self.__display_support_link(msg)
|
|
865
873
|
return None
|
|
866
874
|
|
|
867
|
-
cat_features =
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
estimator is not None
|
|
871
|
-
and hasattr(estimator, "get_param")
|
|
872
|
-
and estimator.get_param("cat_features") is not None
|
|
873
|
-
):
|
|
874
|
-
cat_features = estimator.get_param("cat_features")
|
|
875
|
-
if len(cat_features) > 0 and isinstance(cat_features[0], int):
|
|
876
|
-
cat_features = [effective_X.columns[i] for i in cat_features]
|
|
877
|
-
for cat_feature in cat_features:
|
|
878
|
-
if cat_feature in self.search_keys:
|
|
879
|
-
if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
880
|
-
search_keys_for_metrics.append(cat_feature)
|
|
881
|
-
else:
|
|
882
|
-
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
875
|
+
cat_features, search_keys_for_metrics = self._get_client_cat_features(
|
|
876
|
+
estimator, effective_X, self.search_keys
|
|
877
|
+
)
|
|
883
878
|
|
|
884
879
|
prepared_data = self._prepare_data_for_metrics(
|
|
885
880
|
trace_id=trace_id,
|
|
@@ -894,6 +889,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
894
889
|
search_keys_for_metrics=search_keys_for_metrics,
|
|
895
890
|
progress_bar=progress_bar,
|
|
896
891
|
progress_callback=progress_callback,
|
|
892
|
+
cat_features=cat_features,
|
|
897
893
|
)
|
|
898
894
|
if prepared_data is None:
|
|
899
895
|
return None
|
|
@@ -1269,6 +1265,29 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1269
1265
|
|
|
1270
1266
|
return _cv, groups
|
|
1271
1267
|
|
|
1268
|
+
def _get_client_cat_features(
|
|
1269
|
+
self, estimator: Optional[Any], X: pd.DataFrame, search_keys: Dict[str, SearchKey]
|
|
1270
|
+
) -> Optional[List[str]]:
|
|
1271
|
+
cat_features = None
|
|
1272
|
+
search_keys_for_metrics = []
|
|
1273
|
+
if (
|
|
1274
|
+
estimator is not None
|
|
1275
|
+
and hasattr(estimator, "get_param")
|
|
1276
|
+
and estimator.get_param("cat_features") is not None
|
|
1277
|
+
):
|
|
1278
|
+
cat_features = estimator.get_param("cat_features")
|
|
1279
|
+
if len(cat_features) > 0:
|
|
1280
|
+
if all([isinstance(f, int) for f in cat_features]):
|
|
1281
|
+
cat_features = [X.columns[i] for i in cat_features]
|
|
1282
|
+
self.logger.info(f"Collected categorical features {cat_features} from user estimator")
|
|
1283
|
+
for cat_feature in cat_features:
|
|
1284
|
+
if cat_feature in search_keys:
|
|
1285
|
+
if search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
1286
|
+
search_keys_for_metrics.append(cat_feature)
|
|
1287
|
+
else:
|
|
1288
|
+
raise ValidationError(self.bundle.get("cat_feature_search_key").format(cat_feature))
|
|
1289
|
+
return cat_features, search_keys_for_metrics
|
|
1290
|
+
|
|
1272
1291
|
def _prepare_data_for_metrics(
|
|
1273
1292
|
self,
|
|
1274
1293
|
trace_id: str,
|
|
@@ -1283,6 +1302,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1283
1302
|
search_keys_for_metrics: Optional[List[str]] = None,
|
|
1284
1303
|
progress_bar: Optional[ProgressBar] = None,
|
|
1285
1304
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
1305
|
+
cat_features: Optional[List[str]] = None,
|
|
1286
1306
|
):
|
|
1287
1307
|
is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
|
|
1288
1308
|
is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
|
|
@@ -1340,9 +1360,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1340
1360
|
|
|
1341
1361
|
# Detect and drop high cardinality columns in train
|
|
1342
1362
|
columns_with_high_cardinality = FeaturesValidator.find_high_cardinality(fitting_X)
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
]
|
|
1363
|
+
non_excluding_columns = (self.generate_features or []) + (cat_features or [])
|
|
1364
|
+
columns_with_high_cardinality = [c for c in columns_with_high_cardinality if c not in non_excluding_columns]
|
|
1346
1365
|
if len(columns_with_high_cardinality) > 0:
|
|
1347
1366
|
self.logger.warning(
|
|
1348
1367
|
f"High cardinality columns {columns_with_high_cardinality} will be dropped for metrics calculation"
|
|
@@ -1804,10 +1823,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1804
1823
|
else:
|
|
1805
1824
|
features_section = ""
|
|
1806
1825
|
|
|
1807
|
-
|
|
1826
|
+
search_id = self._search_task.search_task_id
|
|
1827
|
+
api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
|
|
1808
1828
|
-H 'Authorization: {self.api_key}' \\
|
|
1809
1829
|
-H 'Content-Type: application/json' \\
|
|
1810
|
-
-d '{{"
|
|
1830
|
+
-d '{{"search_keys": {keys}{features_section}}}'"""
|
|
1811
1831
|
return api_example
|
|
1812
1832
|
|
|
1813
1833
|
def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
|
|
@@ -1902,6 +1922,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1902
1922
|
generated_features.extend(converter.generated_features)
|
|
1903
1923
|
else:
|
|
1904
1924
|
self.logger.info("Input dataset hasn't date column")
|
|
1925
|
+
if self.add_date_if_missing:
|
|
1926
|
+
df = self._add_current_date_as_key(df, search_keys, self.logger, self.bundle)
|
|
1905
1927
|
email_column = self._get_email_column(search_keys)
|
|
1906
1928
|
hem_column = self._get_hem_column(search_keys)
|
|
1907
1929
|
email_converted_to_hem = False
|
|
@@ -2220,9 +2242,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2220
2242
|
self.fit_search_keys = self.search_keys.copy()
|
|
2221
2243
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2222
2244
|
|
|
2223
|
-
validate_dates_distribution(
|
|
2224
|
-
validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
|
|
2225
|
-
)
|
|
2245
|
+
validate_dates_distribution(validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter)
|
|
2226
2246
|
|
|
2227
2247
|
maybe_date_column = self._get_date_column(self.fit_search_keys)
|
|
2228
2248
|
has_date = maybe_date_column is not None
|
|
@@ -2273,6 +2293,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2273
2293
|
self.fit_generated_features.extend(converter.generated_features)
|
|
2274
2294
|
else:
|
|
2275
2295
|
self.logger.info("Input dataset hasn't date column")
|
|
2296
|
+
if self.add_date_if_missing:
|
|
2297
|
+
df = self._add_current_date_as_key(df, self.fit_search_keys, self.logger, self.bundle)
|
|
2276
2298
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
2277
2299
|
hem_column = self._get_hem_column(self.fit_search_keys)
|
|
2278
2300
|
email_converted_to_hem = False
|
|
@@ -2853,6 +2875,25 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2853
2875
|
if t in [SearchKey.DATE, SearchKey.DATETIME]:
|
|
2854
2876
|
return col
|
|
2855
2877
|
|
|
2878
|
+
@staticmethod
|
|
2879
|
+
def _add_current_date_as_key(
|
|
2880
|
+
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
2881
|
+
) -> pd.DataFrame:
|
|
2882
|
+
if (
|
|
2883
|
+
set(search_keys.values()) == {SearchKey.PHONE}
|
|
2884
|
+
or set(search_keys.values()) == {SearchKey.EMAIL}
|
|
2885
|
+
or set(search_keys.values()) == {SearchKey.HEM}
|
|
2886
|
+
or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
|
|
2887
|
+
):
|
|
2888
|
+
msg = bundle.get("current_date_added")
|
|
2889
|
+
print(msg)
|
|
2890
|
+
logger.warning(msg)
|
|
2891
|
+
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
2892
|
+
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
2893
|
+
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE, None, logger, bundle)
|
|
2894
|
+
df = converter.convert(df)
|
|
2895
|
+
return df
|
|
2896
|
+
|
|
2856
2897
|
@staticmethod
|
|
2857
2898
|
def _get_group_columns(df: pd.DataFrame, search_keys: Dict[str, SearchKey]) -> List[str]:
|
|
2858
2899
|
return [
|
|
@@ -2903,9 +2944,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2903
2944
|
[
|
|
2904
2945
|
c
|
|
2905
2946
|
for c in df.columns
|
|
2906
|
-
if c not in sort_columns
|
|
2907
|
-
and c not in sort_exclude_columns
|
|
2908
|
-
and df[c].nunique() > 1
|
|
2947
|
+
if c not in sort_columns and c not in sort_exclude_columns and df[c].nunique() > 1
|
|
2909
2948
|
]
|
|
2910
2949
|
# [
|
|
2911
2950
|
# sk
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import inspect
|
|
1
2
|
import logging
|
|
2
3
|
import re
|
|
3
4
|
from copy import deepcopy
|
|
@@ -381,6 +382,11 @@ class EstimatorWrapper:
|
|
|
381
382
|
kwargs["estimator"] = estimator_copy
|
|
382
383
|
if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
|
|
383
384
|
if cat_features is not None:
|
|
385
|
+
for cat_feature in cat_features:
|
|
386
|
+
if cat_feature not in X.columns:
|
|
387
|
+
logger.error(
|
|
388
|
+
f"Client cat_feature `{cat_feature}` not found in X columns: {X.columns.to_list()}"
|
|
389
|
+
)
|
|
384
390
|
estimator_copy.set_params(
|
|
385
391
|
cat_features=[X.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
386
392
|
)
|
|
@@ -647,6 +653,12 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
647
653
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
|
|
648
654
|
if isinstance(scoring, str) and scoring is not None:
|
|
649
655
|
_get_scorer_by_name(scoring)
|
|
656
|
+
elif isinstance(scoring, Callable):
|
|
657
|
+
spec = inspect.getfullargspec(scoring)
|
|
658
|
+
if len(spec.args) < 3:
|
|
659
|
+
raise ValidationError(
|
|
660
|
+
f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, X, y"
|
|
661
|
+
)
|
|
650
662
|
|
|
651
663
|
|
|
652
664
|
def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Optional
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
|
-
from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
|
|
4
|
+
from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype, is_object_dtype
|
|
5
5
|
|
|
6
6
|
from upgini.errors import ValidationError
|
|
7
7
|
|
|
@@ -44,7 +44,7 @@ class PhoneNormalizer:
|
|
|
44
44
|
Method will remove all non numeric chars from string and convert it to int.
|
|
45
45
|
None will be set for phone numbers that couldn"t be converted to int
|
|
46
46
|
"""
|
|
47
|
-
if is_string_dtype(self.df[self.phone_column_name]):
|
|
47
|
+
if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
|
|
48
48
|
convert_func = self.phone_str_to_int_safe
|
|
49
49
|
elif is_float_dtype(self.df[self.phone_column_name]):
|
|
50
50
|
convert_func = self.phone_float_to_int_safe
|
|
@@ -38,6 +38,7 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
|
|
|
38
38
|
loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
|
|
39
39
|
multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
|
|
40
40
|
group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
|
|
41
|
+
current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
|
|
41
42
|
|
|
42
43
|
# Errors
|
|
43
44
|
failed_search_by_task_id=Failed to retrieve the specified search results
|
|
@@ -158,7 +159,7 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
|
|
|
158
159
|
dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
|
|
159
160
|
dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
|
|
160
161
|
dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
|
|
161
|
-
dataset_rarest_class_less_min=
|
|
162
|
+
dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
|
|
162
163
|
dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
|
|
163
164
|
dataset_date_features=\nWARNING: Columns {} is a datetime or period type but not used as a search key, removed from X
|
|
164
165
|
dataset_too_many_features=Too many features. Maximum number of features is {}
|
|
@@ -100,6 +100,9 @@ class DateTimeSearchKeyConverter:
|
|
|
100
100
|
msg = self.bundle.get("unsupported_date_type").format(self.date_column)
|
|
101
101
|
self.logger.warning(msg)
|
|
102
102
|
raise ValidationError(msg)
|
|
103
|
+
else:
|
|
104
|
+
df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
|
|
105
|
+
df[self.date_column] = self.parse_date(df)
|
|
103
106
|
|
|
104
107
|
# If column with date is datetime then extract seconds of the day and minute of the hour
|
|
105
108
|
# as additional features
|
|
@@ -55,7 +55,7 @@ def _get_execution_ide() -> str:
|
|
|
55
55
|
def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
|
|
56
56
|
# default values
|
|
57
57
|
track = {"ide": _get_execution_ide()}
|
|
58
|
-
ident_res = "https://
|
|
58
|
+
ident_res = "https://api64.ipify.org"
|
|
59
59
|
|
|
60
60
|
try:
|
|
61
61
|
track["hostname"] = socket.gethostname()
|
|
@@ -74,17 +74,20 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
74
74
|
display(
|
|
75
75
|
Javascript(
|
|
76
76
|
"""
|
|
77
|
-
|
|
77
|
+
async function getVisitorId() {
|
|
78
|
+
return import('https://upgini.github.io/upgini/js/a.js')
|
|
78
79
|
.then(FingerprintJS => FingerprintJS.load())
|
|
79
80
|
.then(fp => fp.get())
|
|
80
|
-
.then(result =>
|
|
81
|
+
.then(result => result.visitorId);
|
|
82
|
+
}
|
|
81
83
|
"""
|
|
82
84
|
)
|
|
83
85
|
)
|
|
84
|
-
track["visitorId"] = output.eval_js("
|
|
86
|
+
track["visitorId"] = output.eval_js("getVisitorId()", timeout_sec=30)
|
|
85
87
|
except Exception as e:
|
|
86
88
|
track["err"] = str(e)
|
|
87
|
-
|
|
89
|
+
if "visitorId" not in track:
|
|
90
|
+
track["visitorId"] = "None"
|
|
88
91
|
if client_ip:
|
|
89
92
|
track["ip"] = client_ip
|
|
90
93
|
else:
|
|
@@ -95,16 +98,19 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
95
98
|
display(
|
|
96
99
|
Javascript(
|
|
97
100
|
f"""
|
|
98
|
-
|
|
101
|
+
async function getIP() {{
|
|
102
|
+
return fetch("{ident_res}")
|
|
99
103
|
.then(response => response.text())
|
|
100
|
-
.then(data =>
|
|
104
|
+
.then(data => data);
|
|
105
|
+
}}
|
|
101
106
|
"""
|
|
102
107
|
)
|
|
103
108
|
)
|
|
104
|
-
track["ip"] = output.eval_js("
|
|
109
|
+
track["ip"] = output.eval_js("getIP()", timeout_sec=10)
|
|
105
110
|
except Exception as e:
|
|
106
111
|
track["err"] = str(e)
|
|
107
|
-
|
|
112
|
+
if "ip" not in track:
|
|
113
|
+
track["ip"] = "0.0.0.0"
|
|
108
114
|
|
|
109
115
|
elif track["ide"] == "binder":
|
|
110
116
|
try:
|
|
@@ -116,8 +122,10 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
116
122
|
track["visitorId"] = sha256(os.environ["CLIENT_IP"].encode()).hexdigest()
|
|
117
123
|
except Exception as e:
|
|
118
124
|
track["err"] = str(e)
|
|
119
|
-
|
|
120
|
-
|
|
125
|
+
if "ip" not in track:
|
|
126
|
+
track["ip"] = "0.0.0.0"
|
|
127
|
+
if "visitorId" not in track:
|
|
128
|
+
track["visitorId"] = "None"
|
|
121
129
|
|
|
122
130
|
elif track["ide"] == "kaggle":
|
|
123
131
|
try:
|
|
@@ -136,8 +144,8 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
136
144
|
raise Exception(err)
|
|
137
145
|
except Exception as e:
|
|
138
146
|
track["err"] = str(e)
|
|
139
|
-
|
|
140
|
-
|
|
147
|
+
if "visitorId" not in track:
|
|
148
|
+
track["visitorId"] = "None"
|
|
141
149
|
else:
|
|
142
150
|
try:
|
|
143
151
|
if client_ip:
|
|
@@ -150,5 +158,9 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
|
|
|
150
158
|
track["visitorId"] = sha256(str(getnode()).encode()).hexdigest()
|
|
151
159
|
except Exception as e:
|
|
152
160
|
track["err"] = str(e)
|
|
161
|
+
if "visitorId" not in track:
|
|
162
|
+
track["visitorId"] = "None"
|
|
163
|
+
if "ip" not in track:
|
|
164
|
+
track["ip"] = "0.0.0.0"
|
|
153
165
|
|
|
154
166
|
return track
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.273
|
|
3
|
+
Version: 1.1.274
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -26,6 +26,19 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
26
26
|
Requires-Python: >=3.8,<3.11
|
|
27
27
|
Description-Content-Type: text/markdown
|
|
28
28
|
License-File: LICENSE
|
|
29
|
+
Requires-Dist: python-dateutil>=2.8.0
|
|
30
|
+
Requires-Dist: requests>=2.8.0
|
|
31
|
+
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
32
|
+
Requires-Dist: numpy>=1.19.0
|
|
33
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
34
|
+
Requires-Dist: pydantic<2.0.0,>=1.8.2
|
|
35
|
+
Requires-Dist: fastparquet>=0.8.1
|
|
36
|
+
Requires-Dist: python-json-logger>=2.0.2
|
|
37
|
+
Requires-Dist: catboost>=1.0.3
|
|
38
|
+
Requires-Dist: lightgbm>=3.3.2
|
|
39
|
+
Requires-Dist: pyjwt>=2.8.0
|
|
40
|
+
Requires-Dist: xhtml2pdf==0.2.11
|
|
41
|
+
Requires-Dist: ipywidgets>=8.1.0
|
|
29
42
|
|
|
30
43
|
|
|
31
44
|
<!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : low-code feature search and enrichment library for machine learning </h2> -->
|
|
@@ -30,7 +30,8 @@ def test_date_diff_type2():
|
|
|
30
30
|
|
|
31
31
|
operand = DateDiffType2(left_unit="s")
|
|
32
32
|
expected_result = pd.Series([61.0, 182.0])
|
|
33
|
-
|
|
33
|
+
actual = operand.calculate_binary(df.date1, df.date2)
|
|
34
|
+
assert_series_equal(actual, expected_result)
|
|
34
35
|
|
|
35
36
|
|
|
36
37
|
def test_date_diff_list():
|
|
@@ -246,7 +246,7 @@ def test_eval_set_with_diff_order_of_columns(requests_mock: Mocker):
|
|
|
246
246
|
eval1_df = df[10000:11000].reset_index(drop=True)
|
|
247
247
|
eval1_features = eval1_df.drop(columns="target")
|
|
248
248
|
# shuffle columns
|
|
249
|
-
eval1_features = eval1_features[
|
|
249
|
+
eval1_features = eval1_features[list(eval1_features.columns)]
|
|
250
250
|
eval1_target = eval1_df["target"].reset_index(drop=True)
|
|
251
251
|
|
|
252
252
|
eval2_df = df[11000:12000]
|
|
@@ -375,7 +375,7 @@ def test_saved_features_enricher(requests_mock: Mocker):
|
|
|
375
375
|
url = "http://fake_url2"
|
|
376
376
|
|
|
377
377
|
path_to_mock_features = os.path.join(
|
|
378
|
-
os.path.dirname(os.path.realpath(__file__)), "test_data/binary/
|
|
378
|
+
os.path.dirname(os.path.realpath(__file__)), "test_data/binary/validation_features_v3.parquet"
|
|
379
379
|
)
|
|
380
380
|
|
|
381
381
|
mock_default_requests(requests_mock, url)
|
|
@@ -462,7 +462,7 @@ def test_saved_features_enricher(requests_mock: Mocker):
|
|
|
462
462
|
segment_header: [train_segment, eval_1_segment, eval_2_segment],
|
|
463
463
|
rows_header: [10000, 1000, 1000],
|
|
464
464
|
target_mean_header: [0.5044, 0.487, 0.486],
|
|
465
|
-
enriched_gini: [
|
|
465
|
+
enriched_gini: [0.021830, -0.006607, -0.018483],
|
|
466
466
|
}
|
|
467
467
|
)
|
|
468
468
|
print("Expected metrics: ")
|
|
@@ -487,16 +487,13 @@ def test_saved_features_enricher(requests_mock: Mocker):
|
|
|
487
487
|
train_random_indices = random.choice(train_target.index, size=9000, replace=False)
|
|
488
488
|
train_target.loc[train_random_indices] = 0
|
|
489
489
|
|
|
490
|
-
metrics = enricher.calculate_metrics(
|
|
491
|
-
train_features,
|
|
492
|
-
train_target
|
|
493
|
-
)
|
|
490
|
+
metrics = enricher.calculate_metrics(train_features, train_target)
|
|
494
491
|
expected_metrics = pd.DataFrame(
|
|
495
492
|
{
|
|
496
493
|
segment_header: [train_segment],
|
|
497
494
|
rows_header: [10000],
|
|
498
495
|
target_mean_header: [0.049],
|
|
499
|
-
enriched_gini: [0.
|
|
496
|
+
enriched_gini: [0.054454],
|
|
500
497
|
}
|
|
501
498
|
)
|
|
502
499
|
print("Expected metrics: ")
|
|
@@ -2230,8 +2227,9 @@ def test_email_search_key(requests_mock: Mocker):
|
|
|
2230
2227
|
"hashed_email_64ff8c",
|
|
2231
2228
|
"email_one_domain_3b0a68",
|
|
2232
2229
|
"email_domain_10c73f",
|
|
2230
|
+
"current_date_b993c4",
|
|
2233
2231
|
}
|
|
2234
|
-
assert {"hashed_email_64ff8c", "email_one_domain_3b0a68"} == {
|
|
2232
|
+
assert {"hashed_email_64ff8c", "email_one_domain_3b0a68", "current_date_b993c4"} == {
|
|
2235
2233
|
sk for sublist in self.search_keys for sk in sublist
|
|
2236
2234
|
}
|
|
2237
2235
|
raise TestException()
|
|
@@ -2276,10 +2274,18 @@ def test_composit_index_search_key(requests_mock: Mocker):
|
|
|
2276
2274
|
**kwargs,
|
|
2277
2275
|
):
|
|
2278
2276
|
self.validate()
|
|
2279
|
-
assert set(self.columns.to_list()) == {
|
|
2277
|
+
assert set(self.columns.to_list()) == {
|
|
2278
|
+
"system_record_id",
|
|
2279
|
+
"country_aff64e",
|
|
2280
|
+
"postal_code_13534a",
|
|
2281
|
+
"current_date_b993c4",
|
|
2282
|
+
"target",
|
|
2283
|
+
}
|
|
2280
2284
|
assert "country_aff64e" in self.columns
|
|
2281
2285
|
assert "postal_code_13534a"
|
|
2282
|
-
assert {"country_aff64e", "postal_code_13534a"} == {
|
|
2286
|
+
assert {"country_aff64e", "postal_code_13534a", "current_date_b993c4"} == {
|
|
2287
|
+
sk for sublist in self.search_keys for sk in sublist
|
|
2288
|
+
}
|
|
2283
2289
|
# assert "country_fake_a" in self.columns
|
|
2284
2290
|
# assert "postal_code_fake_a" in self.columns
|
|
2285
2291
|
# assert {"country_fake_a", "postal_code_fake_a"} == {sk for sublist in self.search_keys for sk in sublist}
|
|
@@ -2652,5 +2658,4 @@ class DataFrameWrapper:
|
|
|
2652
2658
|
|
|
2653
2659
|
|
|
2654
2660
|
class TestException(Exception):
|
|
2655
|
-
|
|
2656
|
-
super().__init__()
|
|
2661
|
+
pass
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|