upgini 1.2.61__py3-none-any.whl → 1.2.62__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +17 -7
- upgini/utils/target_utils.py +54 -1
- upgini/utils/ts_utils.py +41 -0
- {upgini-1.2.61.dist-info → upgini-1.2.62.dist-info}/METADATA +1 -1
- {upgini-1.2.61.dist-info → upgini-1.2.62.dist-info}/RECORD +8 -7
- {upgini-1.2.61.dist-info → upgini-1.2.62.dist-info}/WHEEL +1 -1
- {upgini-1.2.61.dist-info → upgini-1.2.62.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.61"
|
|
1
|
+
__version__ = "1.2.62"
|
upgini/dataset.py
CHANGED
|
@@ -40,7 +40,7 @@ from upgini.utils.email_utils import EmailSearchKeyConverter
|
|
|
40
40
|
from upgini.utils.target_utils import (
|
|
41
41
|
balance_undersample,
|
|
42
42
|
balance_undersample_forced,
|
|
43
|
-
|
|
43
|
+
balance_undersample_time_series_trunc,
|
|
44
44
|
)
|
|
45
45
|
|
|
46
46
|
try:
|
|
@@ -58,6 +58,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
58
58
|
FIT_SAMPLE_THRESHOLD = 200_000
|
|
59
59
|
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
|
60
60
|
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
|
61
|
+
FIT_SAMPLE_THRESHOLD_TS = 54_000
|
|
62
|
+
FIT_SAMPLE_ROWS_TS = 54_000
|
|
61
63
|
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
|
62
64
|
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
|
63
65
|
IMBALANCE_THESHOLD = 0.6
|
|
@@ -301,7 +303,10 @@ class Dataset: # (pd.DataFrame):
|
|
|
301
303
|
)
|
|
302
304
|
|
|
303
305
|
# Resample over fit threshold
|
|
304
|
-
if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
|
|
306
|
+
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
307
|
+
sample_threshold = self.FIT_SAMPLE_THRESHOLD_TS
|
|
308
|
+
sample_rows = self.FIT_SAMPLE_ROWS_TS
|
|
309
|
+
elif not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
|
|
305
310
|
sample_threshold = self.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
|
|
306
311
|
sample_rows = self.FIT_SAMPLE_WITH_EVAL_SET_ROWS
|
|
307
312
|
else:
|
|
@@ -314,7 +319,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
314
319
|
f"and will be downsampled to {sample_rows}"
|
|
315
320
|
)
|
|
316
321
|
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
317
|
-
resampled_data =
|
|
322
|
+
resampled_data = balance_undersample_time_series_trunc(
|
|
318
323
|
df=self.data,
|
|
319
324
|
id_columns=self.id_columns,
|
|
320
325
|
date_column=next(
|
|
@@ -584,10 +589,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
584
589
|
return search_customization
|
|
585
590
|
|
|
586
591
|
def _rename_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
|
587
|
-
if (
|
|
588
|
-
runtime_parameters is not None
|
|
589
|
-
and runtime_parameters.properties is not None
|
|
590
|
-
):
|
|
592
|
+
if runtime_parameters is not None and runtime_parameters.properties is not None:
|
|
591
593
|
if "generate_features" in runtime_parameters.properties:
|
|
592
594
|
generate_features = runtime_parameters.properties["generate_features"].split(",")
|
|
593
595
|
renamed_generate_features = []
|
|
@@ -607,6 +609,13 @@ class Dataset: # (pd.DataFrame):
|
|
|
607
609
|
|
|
608
610
|
return runtime_parameters
|
|
609
611
|
|
|
612
|
+
def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
    """Write the time-series sample size into the runtime properties.

    When ``self.cv_type`` reports a time-series split, both
    ``"sample_size"`` and ``"iter0_sample_size"`` are set to
    ``FIT_SAMPLE_ROWS_TS``. The object is modified in place and returned;
    ``None`` parameters or ``None`` properties are returned untouched.
    """
    # Nothing to write into when there are no parameters or no properties.
    if runtime_parameters is None or runtime_parameters.properties is None:
        return runtime_parameters

    uses_time_series_cv = self.cv_type is not None and self.cv_type.is_time_series()
    if uses_time_series_cv:
        for property_name in ("sample_size", "iter0_sample_size"):
            runtime_parameters.properties[property_name] = self.FIT_SAMPLE_ROWS_TS

    return runtime_parameters
|
|
618
|
+
|
|
610
619
|
def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
|
611
620
|
if (
|
|
612
621
|
runtime_parameters is not None
|
|
@@ -638,6 +647,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
638
647
|
file_metrics = FileMetrics()
|
|
639
648
|
|
|
640
649
|
runtime_parameters = self._rename_generate_features(runtime_parameters)
|
|
650
|
+
runtime_parameters = self._set_sample_size(runtime_parameters)
|
|
641
651
|
|
|
642
652
|
file_metadata = self.__construct_metadata(exclude_features_sources)
|
|
643
653
|
search_customization = self.__construct_search_customization(
|
upgini/utils/target_utils.py
CHANGED
|
@@ -9,6 +9,7 @@ from upgini.errors import ValidationError
|
|
|
9
9
|
from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
|
|
10
10
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
11
11
|
from upgini.sampler.random_under_sampler import RandomUnderSampler
|
|
12
|
+
from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
|
|
12
13
|
|
|
13
14
|
TS_MIN_DIFFERENT_IDS_RATIO = 0.2
|
|
14
15
|
|
|
@@ -240,7 +241,7 @@ def balance_undersample_forced(
|
|
|
240
241
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
241
242
|
if cv_type is not None and cv_type.is_time_series():
|
|
242
243
|
logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
|
|
243
|
-
resampled_data =
|
|
244
|
+
resampled_data = balance_undersample_time_series_trunc(
|
|
244
245
|
df,
|
|
245
246
|
id_columns=id_columns,
|
|
246
247
|
date_column=date_column,
|
|
@@ -279,6 +280,58 @@ def balance_undersample_forced(
|
|
|
279
280
|
return resampled_data
|
|
280
281
|
|
|
281
282
|
|
|
283
|
+
# Look-back windows tried in order until the truncated data fits the sample size.
# The high-frequency list is used when the detected time unit is shorter than
# DEFAULT_TIME_UNIT_THRESHOLD, the low-frequency list otherwise.
DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)


def balance_undersample_time_series_trunc(
    df: pd.DataFrame,
    id_columns: List[str],
    date_column: str,
    sample_size: int,
    random_state: int = 42,
    logger: Optional[logging.Logger] = None,
    highfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
    lowfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
    time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
    **kwargs,
):
    """Downsample a time-series dataset by truncating its history first.

    The most frequent spacing between dates is detected with
    ``get_most_frequent_time_unit``. Depending on whether that spacing is
    below ``time_unit_threshold``, the high- or low-frequency truncation
    lengths are tried in order via ``trunc_datetime`` until the result has at
    most ``sample_size`` rows. If it is still too large after the last length,
    ``balance_undersample_time_series`` is applied to the truncated rows.

    Args:
        df: Source dataset. Rows are selected back from it by index label.
        id_columns: Columns identifying individual series; may be empty or None.
        date_column: Date column, parsed with ``pd.to_datetime(..., unit="ms")``.
        sample_size: Target maximum number of rows.
        random_state: Passed to ``balance_undersample_time_series``.
        logger: Optional logger for progress messages.
        highfreq_trunc_lengths: Windows tried for spacings below the threshold.
        lowfreq_trunc_lengths: Windows tried for all other spacings.
        time_unit_threshold: Boundary between high and low frequency.
        **kwargs: Forwarded to ``balance_undersample_time_series``.

    Returns:
        The selected rows of ``df``; ``df`` itself, unsampled, when no time
        unit can be detected.
    """
    # Local copy of the keys so a missing id list does not break the column
    # selection below; the caller's value is still what gets forwarded on.
    key_columns = list(id_columns) if id_columns else []

    # Work on a narrow copy so the caller's date column is left untouched.
    dates_df = df[key_columns + [date_column]].copy()
    dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")

    time_unit = get_most_frequent_time_unit(dates_df, key_columns, date_column)
    if logger is not None:
        logger.info(f"Time unit: {time_unit}")

    if time_unit is None:
        if logger is not None:
            logger.info("Cannot detect time unit, returning original dataset")
        return df

    if time_unit < time_unit_threshold:
        trunc_lengths = highfreq_trunc_lengths
    else:
        trunc_lengths = lowfreq_trunc_lengths

    # Start from the full data so an empty list of lengths still leaves a
    # defined frame for the size check below.
    sampled_df = dates_df
    for trunc_length in trunc_lengths:
        sampled_df = trunc_datetime(dates_df, key_columns, date_column, trunc_length, logger=logger)
        if len(sampled_df) <= sample_size:
            break

    # Even the last window may be too large; fall back to undersampling it.
    if len(sampled_df) > sample_size:
        sampled_df = balance_undersample_time_series(
            sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
        )

    # NOTE(review): label-based selection assumes df has a unique index - confirm.
    return df.loc[sampled_df.index]
|
|
333
|
+
|
|
334
|
+
|
|
282
335
|
def balance_undersample_time_series(
|
|
283
336
|
df: pd.DataFrame,
|
|
284
337
|
id_columns: List[str],
|
upgini/utils/ts_utils.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_most_frequent_time_unit(df: pd.DataFrame, id_columns: List[str], date_column: str) -> Optional[pd.DateOffset]:
    """Return the most common spacing between consecutive dates as an offset.

    Within every group of ``id_columns`` (or the whole frame when there are
    none) the distinct dates are sorted and differenced. The most frequent gap
    over all groups is converted to a pandas offset; ties go to the smallest
    gap.

    Args:
        df: Frame holding ``id_columns`` and a datetime ``date_column``.
        id_columns: Columns identifying individual series; may be empty.
        date_column: Name of the datetime column.

    Returns:
        The offset for the most frequent gap, or ``None`` when no group has
        at least two distinct dates.
    """
    gap_seconds = []
    groups = df.groupby(id_columns) if id_columns else [(None, df)]
    for _, group in groups:
        # Missing dates carry no spacing information, so drop them before
        # differencing instead of letting NaN gaps reach the conversion.
        group_dates = group[date_column].dropna().sort_values().unique()
        if len(group_dates) > 1:
            gaps = pd.Series(group_dates[1:] - group_dates[:-1])
            gap_seconds.extend(gaps.dt.total_seconds())

    if not gap_seconds:
        return None

    # Take the mode on plain seconds and convert only the winner, rather than
    # building an offset object for every single gap.
    most_frequent_gap = pd.Series(gap_seconds, dtype="float64").mode().min()
    most_frequent_unit = pd.tseries.frequencies.to_offset(pd.Timedelta(most_frequent_gap, unit="s"))

    return most_frequent_unit if isinstance(most_frequent_unit, pd.DateOffset) else None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def trunc_datetime(
    df: pd.DataFrame,
    id_columns: List[str],
    date_column: str,
    length: pd.DateOffset,
    logger: Optional[logging.Logger] = None,
) -> pd.DataFrame:
    """Keep only the rows that fall inside the trailing ``length`` window.

    The cutoff is the latest date minus ``length``. With ``id_columns`` the
    latest date is taken per group, otherwise over the whole frame. Only rows
    dated strictly after the cutoff are kept.

    Args:
        df: Frame holding ``id_columns`` and a datetime ``date_column``.
        id_columns: Columns identifying individual series; may be empty.
        date_column: Name of the datetime column.
        length: Size of the trailing window to keep.
        logger: Optional logger for a progress message.

    Returns:
        The filtered rows of ``df`` with their original index labels.
    """
    if logger is not None:
        logger.info(f"Truncating time series dataset to {length}")

    if id_columns:
        # Built-in aggregation broadcasts each group's latest date to its rows
        # without a Python callback per group.
        latest = df.groupby(id_columns)[date_column].transform("max")
    else:
        latest = df[date_column].max()

    min_datetime = latest - length
    return df[df[date_column] > min_datetime]
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=X-PIyJPyy-W4DbKWDuHTMhmvRT8La2rsZ63Zaf_MERI,23
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
6
|
upgini/features_enricher.py,sha256=2AMEXtoMrEFw3f0b0CsvkFyS1a7L4aqI2GO_fCsgWac,205336
|
|
7
7
|
upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
|
|
@@ -58,10 +58,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
|
|
|
58
58
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
59
59
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
60
60
|
upgini/utils/sort.py,sha256=w-CoT33W_53ekOROpKI_VRsRmiyWNr2b3IpE5_4MLLA,6395
|
|
61
|
-
upgini/utils/target_utils.py,sha256=
|
|
61
|
+
upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
|
|
62
62
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
63
|
+
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
63
64
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
66
|
-
upgini-1.2.
|
|
67
|
-
upgini-1.2.
|
|
65
|
+
upgini-1.2.62.dist-info/METADATA,sha256=l1TBHJEV26NNT_Er41bbO3ph5UZ-QkzYTpf_JU1Y7ak,49084
|
|
66
|
+
upgini-1.2.62.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
67
|
+
upgini-1.2.62.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
68
|
+
upgini-1.2.62.dist-info/RECORD,,
|
|
File without changes
|