upgini 1.1.269__tar.gz → 1.1.273__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.269/src/upgini.egg-info → upgini-1.1.273}/PKG-INFO +1 -14
- {upgini-1.1.269 → upgini-1.1.273}/setup.py +1 -1
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/autofe/all_operands.py +11 -1
- upgini-1.1.273/src/upgini/autofe/date.py +110 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/data_source/data_source_publisher.py +13 -3
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/features_enricher.py +16 -17
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/resource_bundle/strings.properties +1 -1
- {upgini-1.1.269 → upgini-1.1.273/src/upgini.egg-info}/PKG-INFO +1 -14
- upgini-1.1.273/tests/test_autofe_operands.py +93 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_metrics.py +1 -0
- upgini-1.1.269/src/upgini/autofe/date.py +0 -53
- upgini-1.1.269/tests/test_autofe_operands.py +0 -27
- {upgini-1.1.269 → upgini-1.1.273}/LICENSE +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/README.md +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/pyproject.toml +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/setup.cfg +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/__init__.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/ads.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/dataset.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/errors.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/http.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/metadata.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/metrics.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/search_task.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/spinner.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_country_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_email_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_features_enricher.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_target_utils.py +0 -0
- {upgini-1.1.269 → upgini-1.1.273}/tests/test_widget.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.269
|
|
3
|
+
Version: 1.1.273
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -26,19 +26,6 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
26
26
|
Requires-Python: >=3.8,<3.11
|
|
27
27
|
Description-Content-Type: text/markdown
|
|
28
28
|
License-File: LICENSE
|
|
29
|
-
Requires-Dist: python-dateutil>=2.8.0
|
|
30
|
-
Requires-Dist: requests>=2.8.0
|
|
31
|
-
Requires-Dist: pandas<2.0.0,>=1.1.0
|
|
32
|
-
Requires-Dist: numpy>=1.19.0
|
|
33
|
-
Requires-Dist: scikit-learn>=1.3.0
|
|
34
|
-
Requires-Dist: pydantic<2.0.0,>=1.8.2
|
|
35
|
-
Requires-Dist: fastparquet>=0.8.1
|
|
36
|
-
Requires-Dist: python-json-logger>=2.0.2
|
|
37
|
-
Requires-Dist: catboost>=1.0.3
|
|
38
|
-
Requires-Dist: lightgbm>=3.3.2
|
|
39
|
-
Requires-Dist: pyjwt>=2.8.0
|
|
40
|
-
Requires-Dist: xhtml2pdf==0.2.11
|
|
41
|
-
Requires-Dist: ipywidgets>=8.1.0
|
|
42
29
|
|
|
43
30
|
|
|
44
31
|
<!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : low-code feature search and enrichment library for machine learning </h2> -->
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
from typing import Dict
|
|
2
|
-
from upgini.autofe.date import DateDiff, DateDiffType2
|
|
2
|
+
from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
|
|
3
3
|
from upgini.autofe.groupby import GroupByThenAgg, GroupByThenRank
|
|
4
4
|
from upgini.autofe.operand import Operand
|
|
5
5
|
from upgini.autofe.unary import Abs, Log, Residual, Sqrt, Square, Sigmoid, Floor, Freq
|
|
@@ -38,6 +38,16 @@ ALL_OPERANDS: Dict[str, Operand] = {
|
|
|
38
38
|
Sim(),
|
|
39
39
|
DateDiff(),
|
|
40
40
|
DateDiffType2(),
|
|
41
|
+
DateListDiff(aggregation="min"),
|
|
42
|
+
DateListDiff(aggregation="max"),
|
|
43
|
+
DateListDiff(aggregation="mean"),
|
|
44
|
+
DateListDiff(aggregation="nunique"),
|
|
45
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=0, upper_bound=18),
|
|
46
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=18, upper_bound=23),
|
|
47
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=23, upper_bound=30),
|
|
48
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=30, upper_bound=45),
|
|
49
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=45, upper_bound=60),
|
|
50
|
+
DateListDiffBounded(diff_unit="Y", aggregation="count", lower_bound=60),
|
|
41
51
|
]
|
|
42
52
|
}
|
|
43
53
|
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from typing import Any, Optional, Union
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
from upgini.autofe.operand import PandasOperand
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DateDiffMixin(BaseModel):
|
|
10
|
+
diff_unit: str = "D"
|
|
11
|
+
left_unit: Optional[str] = None
|
|
12
|
+
right_unit: Optional[str] = None
|
|
13
|
+
|
|
14
|
+
def _convert_to_date(
|
|
15
|
+
self, x: Union[pd.DataFrame, pd.Series], unit: Optional[str]
|
|
16
|
+
) -> Union[pd.DataFrame, pd.Series]:
|
|
17
|
+
if isinstance(x, pd.DataFrame):
|
|
18
|
+
return x.apply(lambda y: self._convert_to_date(y, unit), axis=1)
|
|
19
|
+
|
|
20
|
+
return pd.to_datetime(x, unit=unit)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class DateDiff(PandasOperand, DateDiffMixin):
|
|
24
|
+
name = "date_diff"
|
|
25
|
+
is_binary = True
|
|
26
|
+
has_symmetry_importance = True
|
|
27
|
+
|
|
28
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
29
|
+
left = self._convert_to_date(left, self.left_unit)
|
|
30
|
+
right = self._convert_to_date(right, self.right_unit)
|
|
31
|
+
return self.__replace_negative((left - right) / np.timedelta64(1, self.diff_unit))
|
|
32
|
+
|
|
33
|
+
def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
|
|
34
|
+
x[x < 0] = None
|
|
35
|
+
return x
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
39
|
+
name = "date_diff_type2"
|
|
40
|
+
is_binary = True
|
|
41
|
+
has_symmetry_importance = True
|
|
42
|
+
|
|
43
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
44
|
+
left = self._convert_to_date(left, self.left_unit)
|
|
45
|
+
right = self._convert_to_date(right, self.right_unit)
|
|
46
|
+
future = right + (left.dt.year - right.dt.year).apply(
|
|
47
|
+
lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
|
|
48
|
+
)
|
|
49
|
+
before = future[future < left]
|
|
50
|
+
future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
|
|
51
|
+
diff = (future - left) / np.timedelta64(1, self.diff_unit)
|
|
52
|
+
|
|
53
|
+
return diff
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
_ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len, 0)}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class DateListDiff(PandasOperand, DateDiffMixin):
|
|
60
|
+
is_binary = True
|
|
61
|
+
has_symmetry_importance = True
|
|
62
|
+
aggregation: str
|
|
63
|
+
|
|
64
|
+
def __init__(self, **data: Any) -> None:
|
|
65
|
+
if "name" not in data:
|
|
66
|
+
data["name"] = f"date_diff_{data.get('aggregation')}"
|
|
67
|
+
super().__init__(**data)
|
|
68
|
+
|
|
69
|
+
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
70
|
+
left = self._convert_to_date(left, self.left_unit)
|
|
71
|
+
right = right.apply(lambda x: pd.arrays.DatetimeArray(self._convert_to_date(x, self.right_unit)))
|
|
72
|
+
|
|
73
|
+
return pd.Series(left - right.values).apply(lambda x: self._agg(self._diff(x)))
|
|
74
|
+
|
|
75
|
+
def _diff(self, x):
|
|
76
|
+
x = x / np.timedelta64(1, self.diff_unit)
|
|
77
|
+
return x[x > 0]
|
|
78
|
+
|
|
79
|
+
def _agg(self, x):
|
|
80
|
+
method = getattr(np, self.aggregation, None)
|
|
81
|
+
default = np.nan
|
|
82
|
+
if method is None and self.aggregation in _ext_aggregations:
|
|
83
|
+
method, default = _ext_aggregations[self.aggregation]
|
|
84
|
+
elif not callable(method):
|
|
85
|
+
raise ValueError(f"Unsupported aggregation: {self.aggregation}")
|
|
86
|
+
|
|
87
|
+
return method(x) if len(x) > 0 else default
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class DateListDiffBounded(DateListDiff):
|
|
91
|
+
lower_bound: Optional[int]
|
|
92
|
+
upper_bound: Optional[int]
|
|
93
|
+
|
|
94
|
+
def __init__(self, **data: Any) -> None:
|
|
95
|
+
if "name" not in data:
|
|
96
|
+
lower_bound = data.get("lower_bound")
|
|
97
|
+
upper_bound = data.get("upper_bound")
|
|
98
|
+
components = [
|
|
99
|
+
"date_diff",
|
|
100
|
+
data.get("diff_unit"),
|
|
101
|
+
str(lower_bound if lower_bound is not None else "minusinf"),
|
|
102
|
+
str(upper_bound if upper_bound is not None else "plusinf"),
|
|
103
|
+
]
|
|
104
|
+
components.append(data.get("aggregation"))
|
|
105
|
+
data["name"] = "_".join(components)
|
|
106
|
+
super().__init__(**data)
|
|
107
|
+
|
|
108
|
+
def _agg(self, x):
|
|
109
|
+
x = x[(x >= (self.lower_bound or -np.inf)) & (x < (self.upper_bound or np.inf))]
|
|
110
|
+
return super()._agg(x)
|
|
@@ -72,8 +72,8 @@ class DataSourcePublisher:
|
|
|
72
72
|
)
|
|
73
73
|
if search_keys is None or len(search_keys) == 0:
|
|
74
74
|
raise ValidationError("Empty search keys")
|
|
75
|
-
if SearchKey.DATE in search_keys.values() and date_format is None:
|
|
76
|
-
raise ValidationError("date_format is required for DATE search key")
|
75
|
+
# if SearchKey.DATE in search_keys.values() and date_format is None:
|
|
76
|
+
# raise ValidationError("date_format is required for DATE search key")
|
|
77
77
|
if update_frequency not in self.ACCEPTABLE_UPDATE_FREQUENCIES:
|
|
78
78
|
raise ValidationError(
|
|
79
79
|
f"Invalid update frequency: {update_frequency}. "
|
|
@@ -85,11 +85,19 @@ class DataSourcePublisher:
|
|
|
85
85
|
or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
|
|
86
86
|
) and sort_column is None:
|
|
87
87
|
raise ValidationError("Sort column is required for passed search keys")
|
|
88
|
+
if (
|
|
89
|
+
set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
|
|
90
|
+
and snapshot_frequency_days is None
|
|
91
|
+
and join_date_abs_limit_days is None
|
|
92
|
+
):
|
|
93
|
+
raise ValidationError(
|
|
94
|
+
"With MSISDN and DATE keys one of the snapshot_frequency_days or"
|
|
95
|
+
" join_date_abs_limit_days parameters is required"
|
|
96
|
+
)
|
|
88
97
|
|
|
89
98
|
request = {
|
|
90
99
|
"dataTableUri": data_table_uri,
|
|
91
100
|
"searchKeys": {k: v.value.value for k, v in search_keys.items()},
|
|
92
|
-
"dateFormat": date_format,
|
|
93
101
|
"excludeColumns": exclude_columns,
|
|
94
102
|
"hashFeatureNames": str(hash_feature_names).lower(),
|
|
95
103
|
"snapshotFrequencyDays": snapshot_frequency_days,
|
|
@@ -98,6 +106,8 @@ class DataSourcePublisher:
|
|
|
98
106
|
"featuresForEmbeddings": features_for_embeddings,
|
|
99
107
|
"forceGeneration": str(_force_generation).lower(),
|
|
100
108
|
}
|
|
109
|
+
if date_format is not None:
|
|
110
|
+
request["dateFormat"] = date_format
|
|
101
111
|
if secondary_search_keys is not None:
|
|
102
112
|
request["secondarySearchKeys"] = {k: v.value.value for k, v in secondary_search_keys.items()}
|
|
103
113
|
if sort_column is not None:
|
|
@@ -424,7 +424,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
424
424
|
self.X = X
|
|
425
425
|
self.y = y
|
|
426
426
|
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
427
|
-
self.dump_input(trace_id, X, y, eval_set)
|
|
427
|
+
self.dump_input(trace_id, X, y, self.eval_set)
|
|
428
428
|
self.__inner_fit(
|
|
429
429
|
trace_id,
|
|
430
430
|
X,
|
|
@@ -563,7 +563,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
563
563
|
self.X = X
|
|
564
564
|
self.y = y
|
|
565
565
|
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
566
|
-
self.dump_input(trace_id, X, y, eval_set)
|
|
566
|
+
self.dump_input(trace_id, X, y, self.eval_set)
|
|
567
567
|
|
|
568
568
|
if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
|
|
569
569
|
raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(Dataset.MAX_ROWS))
|
|
@@ -823,12 +823,16 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
823
823
|
print(msg)
|
|
824
824
|
|
|
825
825
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
826
|
+
effective_X = X if X is not None else self.X
|
|
827
|
+
effective_y = y if y is not None else self.y
|
|
828
|
+
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
829
|
+
effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
|
|
826
830
|
|
|
827
831
|
try:
|
|
828
832
|
self.__log_debug_information(
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
833
|
+
effective_X,
|
|
834
|
+
effective_y,
|
|
835
|
+
effective_eval_set,
|
|
832
836
|
exclude_features_sources=exclude_features_sources,
|
|
833
837
|
cv=cv if cv is not None else self.cv,
|
|
834
838
|
importance_threshold=importance_threshold,
|
|
@@ -842,17 +846,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
842
846
|
self._search_task is None
|
|
843
847
|
or self._search_task.provider_metadata_v2 is None
|
|
844
848
|
or len(self._search_task.provider_metadata_v2) == 0
|
|
845
|
-
or
|
|
846
|
-
or
|
|
849
|
+
or effective_X is None
|
|
850
|
+
or effective_y is None
|
|
847
851
|
):
|
|
848
852
|
raise ValidationError(self.bundle.get("metrics_unfitted_enricher"))
|
|
849
853
|
|
|
850
854
|
if X is not None and y is None:
|
|
851
855
|
raise ValidationError("X passed without y")
|
|
852
856
|
|
|
853
|
-
effective_X = X if X is not None else self.X
|
|
854
|
-
effective_eval_set = eval_set if eval_set is not None else self.eval_set
|
|
855
|
-
|
|
856
857
|
validate_scoring_argument(scoring)
|
|
857
858
|
|
|
858
859
|
self._validate_baseline_score(effective_X, effective_eval_set)
|
|
@@ -872,8 +873,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
872
873
|
):
|
|
873
874
|
cat_features = estimator.get_param("cat_features")
|
|
874
875
|
if len(cat_features) > 0 and isinstance(cat_features[0], int):
|
|
875
|
-
|
|
876
|
-
cat_features = [effectiveX.columns[i] for i in cat_features]
|
|
876
|
+
cat_features = [effective_X.columns[i] for i in cat_features]
|
|
877
877
|
for cat_feature in cat_features:
|
|
878
878
|
if cat_feature in self.search_keys:
|
|
879
879
|
if self.search_keys[cat_feature] in [SearchKey.COUNTRY, SearchKey.POSTAL_CODE]:
|
|
@@ -883,9 +883,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
883
883
|
|
|
884
884
|
prepared_data = self._prepare_data_for_metrics(
|
|
885
885
|
trace_id=trace_id,
|
|
886
|
-
X=
|
|
887
|
-
y=
|
|
888
|
-
eval_set=
|
|
886
|
+
X=effective_X,
|
|
887
|
+
y=effective_y,
|
|
888
|
+
eval_set=effective_eval_set,
|
|
889
889
|
exclude_features_sources=exclude_features_sources,
|
|
890
890
|
importance_threshold=importance_threshold,
|
|
891
891
|
max_features=max_features,
|
|
@@ -995,8 +995,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
995
995
|
enriched_metric = None
|
|
996
996
|
uplift = None
|
|
997
997
|
|
|
998
|
-
effective_X = X if X is not None else self.X
|
|
999
|
-
effective_y = y if y is not None else self.y
|
|
1000
998
|
train_metrics = {
|
|
1001
999
|
self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
|
|
1002
1000
|
"quality_metrics_train_segment"
|
|
@@ -2823,6 +2821,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2823
2821
|
|
|
2824
2822
|
maybe_date_col = self._get_date_column(self.search_keys)
|
|
2825
2823
|
if X is not None and maybe_date_col is not None and maybe_date_col in X.columns:
|
|
2824
|
+
# TODO cast date column to single dtype
|
|
2826
2825
|
min_date = X[maybe_date_col].min()
|
|
2827
2826
|
max_date = X[maybe_date_col].max()
|
|
2828
2827
|
self.logger.info(f"Dates interval is ({min_date}, {max_date})")
|
|
@@ -203,7 +203,7 @@ phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`.
|
|
|
203
203
|
target_type_detected=\nDetected task type: {}\n
|
|
204
204
|
# all_ok_community_invite=Chat with us in Slack community:
|
|
205
205
|
all_ok_community_invite=❓ Support request
|
|
206
|
-
too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
206
|
+
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
207
207
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
208
208
|
loss_selection_info=Using loss `{}` for feature selection
|
|
209
209
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.1.269
|
|
3
|
+
Version: 1.1.273
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Home-page: https://upgini.com/
|
|
6
6
|
Author: Upgini Developers
|
|
@@ -26,19 +26,6 @@ Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
|
26
26
|
Requires-Python: >=3.8,<3.11
|
|
27
27
|
Description-Content-Type: text/markdown
|
|
28
28
|
License-File: LICENSE
|
|
29
|
-
Requires-Dist: python-dateutil>=2.8.0
|
|
30
|
-
Requires-Dist: requests>=2.8.0
|
|
31
|
-
Requires-Dist: pandas<2.0.0,>=1.1.0
|
|
32
|
-
Requires-Dist: numpy>=1.19.0
|
|
33
|
-
Requires-Dist: scikit-learn>=1.3.0
|
|
34
|
-
Requires-Dist: pydantic<2.0.0,>=1.8.2
|
|
35
|
-
Requires-Dist: fastparquet>=0.8.1
|
|
36
|
-
Requires-Dist: python-json-logger>=2.0.2
|
|
37
|
-
Requires-Dist: catboost>=1.0.3
|
|
38
|
-
Requires-Dist: lightgbm>=3.3.2
|
|
39
|
-
Requires-Dist: pyjwt>=2.8.0
|
|
40
|
-
Requires-Dist: xhtml2pdf==0.2.11
|
|
41
|
-
Requires-Dist: ipywidgets>=8.1.0
|
|
42
29
|
|
|
43
30
|
|
|
44
31
|
<!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : low-code feature search and enrichment library for machine learning </h2> -->
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
from upgini.autofe.date import DateDiff, DateDiffType2, DateListDiff, DateListDiffBounded
|
|
3
|
+
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from pandas.testing import assert_series_equal
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_date_diff():
|
|
9
|
+
df = pd.DataFrame(
|
|
10
|
+
[
|
|
11
|
+
["2022-10-10", pd.to_datetime("1993-12-10").timestamp()],
|
|
12
|
+
["2022-10-10", pd.to_datetime("2023-10-10").timestamp()],
|
|
13
|
+
],
|
|
14
|
+
columns=["date1", "date2"],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
operand = DateDiff(right_unit="s")
|
|
18
|
+
expected_result = pd.Series([10531, None])
|
|
19
|
+
assert_series_equal(operand.calculate_binary(df.date1, df.date2), expected_result)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_date_diff_type2():
|
|
23
|
+
df = pd.DataFrame(
|
|
24
|
+
[
|
|
25
|
+
[pd.to_datetime("2022-10-10").timestamp(), datetime(1993, 12, 10)],
|
|
26
|
+
[pd.to_datetime("2022-10-10").timestamp(), datetime(1993, 4, 10)],
|
|
27
|
+
],
|
|
28
|
+
columns=["date1", "date2"],
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
operand = DateDiffType2(left_unit="s")
|
|
32
|
+
expected_result = pd.Series([61.0, 182.0])
|
|
33
|
+
assert_series_equal(operand.calculate_binary(df.date1, df.date2), expected_result)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_date_diff_list():
|
|
37
|
+
df = pd.DataFrame(
|
|
38
|
+
[
|
|
39
|
+
["2022-10-10", ["1993-12-10", "1993-12-11"]],
|
|
40
|
+
["2022-10-10", ["1993-12-10", "1993-12-10"]],
|
|
41
|
+
["2022-10-10", ["2023-10-10"]],
|
|
42
|
+
["2022-10-10", []],
|
|
43
|
+
],
|
|
44
|
+
columns=["date1", "date2"],
|
|
45
|
+
)
|
|
46
|
+
|
|
47
|
+
def check(aggregation, expected_name, expected_values):
|
|
48
|
+
operand = DateListDiff(aggregation=aggregation)
|
|
49
|
+
assert operand.name == expected_name
|
|
50
|
+
assert_series_equal(operand.calculate_binary(df.date1, df.date2).rename(None), expected_values)
|
|
51
|
+
|
|
52
|
+
check(aggregation="min", expected_name="date_diff_min", expected_values=pd.Series([10530, 10531, None, None]))
|
|
53
|
+
check(aggregation="max", expected_name="date_diff_max", expected_values=pd.Series([10531, 10531, None, None]))
|
|
54
|
+
check(aggregation="mean", expected_name="date_diff_mean", expected_values=pd.Series([10530.5, 10531, None, None]))
|
|
55
|
+
check(aggregation="nunique", expected_name="date_diff_nunique", expected_values=pd.Series([2, 1, 0, 0]))
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def test_date_diff_list_bounded():
|
|
59
|
+
df = pd.DataFrame(
|
|
60
|
+
[
|
|
61
|
+
["2022-10-10", ["2013-12-10", "2013-12-11", "1999-12-11"]],
|
|
62
|
+
[
|
|
63
|
+
"2022-10-10",
|
|
64
|
+
[
|
|
65
|
+
"2013-12-10",
|
|
66
|
+
"2003-12-11",
|
|
67
|
+
"1999-12-11",
|
|
68
|
+
"1993-12-11",
|
|
69
|
+
"1983-12-11",
|
|
70
|
+
"1973-12-11",
|
|
71
|
+
"1959-12-11",
|
|
72
|
+
],
|
|
73
|
+
],
|
|
74
|
+
["2022-10-10", ["2003-12-10", "2003-12-10"]],
|
|
75
|
+
["2022-10-10", ["2023-10-10", "1993-12-10"]],
|
|
76
|
+
["2022-10-10", []],
|
|
77
|
+
],
|
|
78
|
+
columns=["date1", "date2"],
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
def check_num_by_years(lower_bound, upper_bound, expected_name, expected_values):
|
|
82
|
+
operand = DateListDiffBounded(
|
|
83
|
+
diff_unit="Y", aggregation="count", lower_bound=lower_bound, upper_bound=upper_bound
|
|
84
|
+
)
|
|
85
|
+
assert operand.name == expected_name
|
|
86
|
+
assert_series_equal(operand.calculate_binary(df.date1, df.date2).rename(None), expected_values)
|
|
87
|
+
|
|
88
|
+
check_num_by_years(0, 18, "date_diff_Y_0_18_count", pd.Series([2, 1, 0, 0, 0]))
|
|
89
|
+
check_num_by_years(18, 23, "date_diff_Y_18_23_count", pd.Series([1, 2, 2, 0, 0]))
|
|
90
|
+
check_num_by_years(23, 30, "date_diff_Y_23_30_count", pd.Series([0, 1, 0, 1, 0]))
|
|
91
|
+
check_num_by_years(30, 45, "date_diff_Y_30_45_count", pd.Series([0, 1, 0, 0, 0]))
|
|
92
|
+
check_num_by_years(45, 60, "date_diff_Y_45_60_count", pd.Series([0, 1, 0, 0, 0]))
|
|
93
|
+
check_num_by_years(60, None, "date_diff_Y_60_plusinf_count", pd.Series([0, 1, 0, 0, 0]))
|
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
from typing import Optional, Union
|
|
2
|
-
import numpy as np
|
|
3
|
-
import pandas as pd
|
|
4
|
-
|
|
5
|
-
from upgini.autofe.operand import PandasOperand
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class DateDiffMixin:
|
|
9
|
-
diff_unit: str = "D"
|
|
10
|
-
left_unit: Optional[str] = None
|
|
11
|
-
right_unit: Optional[str] = None
|
|
12
|
-
|
|
13
|
-
def _convert_to_date(
|
|
14
|
-
self, x: Union[pd.DataFrame, pd.Series], unit: Optional[str]
|
|
15
|
-
) -> Union[pd.DataFrame, pd.Series]:
|
|
16
|
-
if isinstance(x, pd.DataFrame):
|
|
17
|
-
return x.apply(lambda y: self._convert_to_date(y, unit), axis=1)
|
|
18
|
-
|
|
19
|
-
return pd.to_datetime(x, unit=unit)
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class DateDiff(PandasOperand, DateDiffMixin):
|
|
23
|
-
name = "date_diff"
|
|
24
|
-
is_binary = True
|
|
25
|
-
has_symmetry_importance = True
|
|
26
|
-
|
|
27
|
-
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
28
|
-
left = self._convert_to_date(left, self.left_unit)
|
|
29
|
-
right = self._convert_to_date(right, self.right_unit)
|
|
30
|
-
return self.__replace_negative((left - right) / np.timedelta64(1, self.diff_unit))
|
|
31
|
-
|
|
32
|
-
def __replace_negative(self, x: Union[pd.DataFrame, pd.Series]):
|
|
33
|
-
x[x < 0] = None
|
|
34
|
-
return x
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
38
|
-
name = "date_diff_type2"
|
|
39
|
-
is_binary = True
|
|
40
|
-
has_symmetry_importance = True
|
|
41
|
-
is_vectorizable = False
|
|
42
|
-
|
|
43
|
-
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
44
|
-
left = self._convert_to_date(left, self.left_unit)
|
|
45
|
-
right = self._convert_to_date(right, self.right_unit)
|
|
46
|
-
future = right + (left.dt.year - right.dt.year).apply(
|
|
47
|
-
lambda y: np.datetime64("NaT") if np.isnan(y) else pd.tseries.offsets.DateOffset(years=y)
|
|
48
|
-
)
|
|
49
|
-
before = future[future < left]
|
|
50
|
-
future[future < left] = before + pd.tseries.offsets.DateOffset(years=1)
|
|
51
|
-
diff = (future - left) / np.timedelta64(1, self.diff_unit)
|
|
52
|
-
|
|
53
|
-
return diff
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
|
-
from upgini.autofe.date import DateDiff, DateDiffType2
|
|
3
|
-
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from pandas.testing import assert_series_equal
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
def test_date_diff():
|
|
9
|
-
df = pd.DataFrame(
|
|
10
|
-
[[datetime(1993, 12, 10), datetime(2022, 10, 10)], [datetime(2023, 10, 10), datetime(2022, 10, 10)]],
|
|
11
|
-
columns=["date1", "date2"],
|
|
12
|
-
)
|
|
13
|
-
|
|
14
|
-
operand = DateDiff()
|
|
15
|
-
expected_result = pd.Series([10531, None])
|
|
16
|
-
assert_series_equal(operand.calculate_binary(df.date2, df.date1), expected_result)
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def test_date_diff_future():
|
|
20
|
-
df = pd.DataFrame(
|
|
21
|
-
[[datetime(1993, 12, 10), datetime(2022, 10, 10)], [datetime(1993, 4, 10), datetime(2022, 10, 10)]],
|
|
22
|
-
columns=["date1", "date2"],
|
|
23
|
-
)
|
|
24
|
-
|
|
25
|
-
operand = DateDiffType2()
|
|
26
|
-
expected_result = pd.Series([61.0, 182.0])
|
|
27
|
-
assert_series_equal(operand.calculate_binary(df.date2, df.date1), expected_result)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|