upgini 1.1.264a1__tar.gz → 1.1.265__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.1.264a1/src/upgini.egg-info → upgini-1.1.265}/PKG-INFO +1 -1
- {upgini-1.1.264a1 → upgini-1.1.265}/setup.py +1 -1
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/features_enricher.py +6 -1
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/resource_bundle/strings.properties +1 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/datetime_utils.py +49 -1
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/target_utils.py +2 -6
- {upgini-1.1.264a1 → upgini-1.1.265/src/upgini.egg-info}/PKG-INFO +1 -1
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_datetime_utils.py +30 -2
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_features_enricher.py +2 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/LICENSE +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/README.md +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/pyproject.toml +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/setup.cfg +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/__init__.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/ads.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/date.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/dataset.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/errors.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/http.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/metadata.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/metrics.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/search_task.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/spinner.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_autofe_operands.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_country_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_email_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_metrics.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_target_utils.py +0 -0
- {upgini-1.1.264a1 → upgini-1.1.265}/tests/test_widget.py +0 -0
|
@@ -70,6 +70,7 @@ from upgini.utils.datetime_utils import (
|
|
|
70
70
|
DateTimeSearchKeyConverter,
|
|
71
71
|
is_blocked_time_series,
|
|
72
72
|
is_time_series,
|
|
73
|
+
validate_dates_distribution,
|
|
73
74
|
)
|
|
74
75
|
from upgini.utils.deduplicate_utils import (
|
|
75
76
|
clean_full_duplicates,
|
|
@@ -1922,7 +1923,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1922
1923
|
|
|
1923
1924
|
meaning_types = {col: key.value for col, key in search_keys.items()}
|
|
1924
1925
|
non_keys_columns = [column for column in df.columns if column not in search_keys.keys()]
|
|
1925
|
-
|
|
1926
|
+
|
|
1926
1927
|
if email_converted_to_hem:
|
|
1927
1928
|
non_keys_columns.append(email_column)
|
|
1928
1929
|
|
|
@@ -2221,6 +2222,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2221
2222
|
self.fit_search_keys = self.search_keys.copy()
|
|
2222
2223
|
self.fit_search_keys = self.__prepare_search_keys(validated_X, self.fit_search_keys, is_demo_dataset)
|
|
2223
2224
|
|
|
2225
|
+
validate_dates_distribution(
|
|
2226
|
+
validated_X, self.fit_search_keys, self.logger, self.bundle, self.warning_counter
|
|
2227
|
+
)
|
|
2228
|
+
|
|
2224
2229
|
has_date = self._get_date_column(self.fit_search_keys) is not None
|
|
2225
2230
|
model_task_type = self.model_task_type or define_task(validated_y, has_date, self.logger)
|
|
2226
2231
|
self._validate_binary_observations(validated_y, model_task_type)
|
|
@@ -111,6 +111,7 @@ x_is_empty=X is empty
|
|
|
111
111
|
y_is_empty=y is empty
|
|
112
112
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
113
113
|
missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
|
|
114
|
+
x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample.
|
|
114
115
|
# eval set validation
|
|
115
116
|
unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
|
|
116
117
|
eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import datetime
|
|
2
2
|
import logging
|
|
3
3
|
import re
|
|
4
|
-
from typing import List, Optional
|
|
4
|
+
from typing import Dict, List, Optional
|
|
5
5
|
|
|
6
6
|
import numpy as np
|
|
7
7
|
import pandas as pd
|
|
@@ -9,7 +9,9 @@ from dateutil.relativedelta import relativedelta
|
|
|
9
9
|
from pandas.api.types import is_numeric_dtype, is_period_dtype, is_string_dtype
|
|
10
10
|
|
|
11
11
|
from upgini.errors import ValidationError
|
|
12
|
+
from upgini.metadata import SearchKey
|
|
12
13
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
14
|
+
from upgini.utils.warning_counter import WarningCounter
|
|
13
15
|
|
|
14
16
|
DATE_FORMATS = [
|
|
15
17
|
"%Y-%m-%d",
|
|
@@ -225,3 +227,49 @@ def is_blocked_time_series(df: pd.DataFrame, date_col: str, search_keys: List[st
|
|
|
225
227
|
|
|
226
228
|
is_diff_less_than_two_columns = grouped.apply(check_differences)
|
|
227
229
|
return is_diff_less_than_two_columns.all()
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def validate_dates_distribution(
    X: pd.DataFrame,
    search_keys: Dict[str, "SearchKey"],
    logger: Optional[logging.Logger] = None,
    bundle: Optional["ResourceBundle"] = None,
    warning_counter: Optional["WarningCounter"] = None,
):
    """Warn when the number of rows per date differs a lot between the two halves of the date range.

    The date column is the one declared in ``search_keys`` as ``SearchKey.DATE`` or
    ``SearchKey.DATETIME`` (the last such key wins). If none is declared, the first
    non-key, non-numeric column of ``X`` that ``pd.to_datetime`` can parse is used.

    Rows are counted per calendar date, the distinct dates are sorted and split into
    an earlier and a later half, and the mean counts of the halves are compared. When
    the later/earlier ratio is above 1.2 or below 0.8, the ``x_unstable_by_date``
    message from ``bundle`` is printed and logged, and ``warning_counter`` is
    incremented if one was given.

    Args:
        X: Training data to inspect.
        search_keys: Mapping of column name to search key type.
        logger: Logger for the warning; when omitted, a logger named
            ``"muted_logger"`` with level ``FATAL`` is used.
        bundle: Source of the message text; falls back to ``get_custom_bundle()``.
        warning_counter: Optional counter incremented when the warning fires.

    Returns:
        None. The function only reports; it does nothing when no usable date column
        is found or when there are fewer than two distinct dates.
    """
    maybe_date_col = None
    for key, key_type in search_keys.items():
        if key_type in (SearchKey.DATE, SearchKey.DATETIME):
            maybe_date_col = key

    dates = None
    if maybe_date_col is not None:
        # Advisory check only: an unparseable declared date column must not abort the caller.
        dates = _parse_dates_or_none(X[maybe_date_col])
    else:
        for col in X.columns:
            # pd.to_datetime happily reads numbers as epoch offsets, so a numeric
            # feature would be mistaken for a date column and hide the real one.
            if col in search_keys or pd.api.types.is_numeric_dtype(X[col]):
                continue
            dates = _parse_dates_or_none(X[col])
            if dates is not None:
                break

    if dates is None:
        return

    date_counts = dates.value_counts().sort_index()
    if len(date_counts) < 2:
        # One half would be empty, so there is nothing to compare.
        return

    half = round(len(date_counts) / 2)
    ratio = date_counts.iloc[half:].mean() / date_counts.iloc[:half].mean()

    if ratio > 1.2 or ratio < 0.8:
        if warning_counter is not None:
            warning_counter.increment()
        if logger is None:
            logger = logging.getLogger("muted_logger")
            logger.setLevel("FATAL")
        bundle = bundle or get_custom_bundle()
        msg = bundle.get("x_unstable_by_date")
        print(msg)
        logger.warning(msg)


def _parse_dates_or_none(column: pd.Series) -> Optional[pd.Series]:
    """Return ``column`` converted to calendar dates, or ``None`` if it cannot be parsed."""
    try:
        return pd.to_datetime(column).dt.date
    except Exception:
        # Probing arbitrary user data: any failure simply means "not a date column".
        return None
|
|
@@ -132,9 +132,7 @@ def balance_undersample(
|
|
|
132
132
|
class_value = classes[class_idx]
|
|
133
133
|
class_count = vc[class_value]
|
|
134
134
|
sample_strategy[class_value] = min(class_count, quantile25_class_cnt * multiclass_bootstrap_loops)
|
|
135
|
-
sampler = RandomUnderSampler(
|
|
136
|
-
sampling_strategy=sample_strategy, random_state=random_state
|
|
137
|
-
)
|
|
135
|
+
sampler = RandomUnderSampler(sampling_strategy=sample_strategy, random_state=random_state)
|
|
138
136
|
X = df[SYSTEM_RECORD_ID]
|
|
139
137
|
X = X.to_frame(SYSTEM_RECORD_ID)
|
|
140
138
|
new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
@@ -153,9 +151,7 @@ def balance_undersample(
|
|
|
153
151
|
minority_class = df[df[target_column] == min_class_value]
|
|
154
152
|
majority_class = df[df[target_column] != min_class_value]
|
|
155
153
|
sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
156
|
-
sampled_majority_class = majority_class.sample(
|
|
157
|
-
n=sample_size, random_state=random_state
|
|
158
|
-
)
|
|
154
|
+
sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
|
|
159
155
|
resampled_data = df[
|
|
160
156
|
(df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
|
|
161
157
|
| (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
|
|
@@ -1,7 +1,13 @@
|
|
|
1
|
-
import pandas as pd
|
|
2
1
|
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
3
|
|
|
4
|
-
from upgini.
|
|
4
|
+
from upgini.metadata import SearchKey
|
|
5
|
+
from upgini.utils.datetime_utils import (
|
|
6
|
+
is_blocked_time_series,
|
|
7
|
+
is_time_series,
|
|
8
|
+
validate_dates_distribution,
|
|
9
|
+
)
|
|
10
|
+
from upgini.utils.warning_counter import WarningCounter
|
|
5
11
|
|
|
6
12
|
pd.set_option("mode.chained_assignment", "raise")
|
|
7
13
|
|
|
@@ -183,3 +189,25 @@ def test_multivariate_time_series():
|
|
|
183
189
|
assert not is_blocked_time_series(df, "date", ["date"])
|
|
184
190
|
|
|
185
191
|
assert is_blocked_time_series(df, "date", ["date", "feature3"])
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def test_validate_dates_distribution():
    """Check that the date-distribution warning fires only for unbalanced samples."""
    # Row counts per date grow over time: 10, 20, 30, 40.
    growing = ["2020-01-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40
    # Same counts, but the 10-row date is the latest one, so both halves average the same.
    balanced = ["2020-05-01"] * 10 + ["2020-02-01"] * 20 + ["2020-03-01"] * 30 + ["2020-04-01"] * 40

    # Auto-detected date column with a growing sample -> warning.
    counter = WarningCounter()
    validate_dates_distribution(pd.DataFrame({"date": growing}), {}, warning_counter=counter)
    assert counter.has_warnings()

    # Auto-detected date column with a balanced sample -> no warning.
    counter = WarningCounter()
    validate_dates_distribution(pd.DataFrame({"date": balanced}), {}, warning_counter=counter)
    assert not counter.has_warnings()

    # The declared DATE search key is used even though another date-like column comes first.
    frame = pd.DataFrame({"date2": balanced, "date1": growing})
    counter = WarningCounter()
    validate_dates_distribution(frame, {"date1": SearchKey.DATE}, warning_counter=counter)
    assert counter.has_warnings()
|
|
@@ -2164,6 +2164,8 @@ def test_idempotent_order_with_imbalanced_dataset(requests_mock: Mocker):
|
|
|
2164
2164
|
|
|
2165
2165
|
actual_result_df = result_wrapper.df.sort_values(by="system_record_id").reset_index(drop=True)
|
|
2166
2166
|
# actual_result_df.to_parquet(expected_result_path)
|
|
2167
|
+
actual_result_df["phone_num_a54a33"] = actual_result_df["phone_num_a54a33"].astype("Int64")
|
|
2168
|
+
actual_result_df["rep_date_f5d6bb"] = actual_result_df["rep_date_f5d6bb"].astype("Int64")
|
|
2167
2169
|
assert_frame_equal(actual_result_df, expected_result_df)
|
|
2168
2170
|
|
|
2169
2171
|
for i in range(5):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|