upgini 1.2.59a3818.dev1__py3-none-any.whl → 1.2.60a3792.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/vector.py +1 -1
- upgini/data_source/data_source_publisher.py +1 -0
- upgini/dataset.py +32 -13
- upgini/features_enricher.py +34 -15
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/email_utils.py +6 -6
- upgini/utils/target_utils.py +54 -1
- upgini/utils/ts_utils.py +47 -0
- {upgini-1.2.59a3818.dev1.dist-info → upgini-1.2.60a3792.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.59a3818.dev1.dist-info → upgini-1.2.60a3792.dev1.dist-info}/RECORD +13 -12
- {upgini-1.2.59a3818.dev1.dist-info → upgini-1.2.60a3792.dev1.dist-info}/WHEEL +0 -0
- {upgini-1.2.59a3818.dev1.dist-info → upgini-1.2.60a3792.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.
|
|
1
|
+
__version__ = "1.2.60a3792.dev1"
|
upgini/autofe/vector.py
CHANGED
|
@@ -55,7 +55,7 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
|
55
55
|
ts.set_index(date.name, inplace=True)
|
|
56
56
|
ts = ts[ts.index.notna()].sort_index()
|
|
57
57
|
ts = (
|
|
58
|
-
ts.groupby([c.name for c in data[1:-1]]
|
|
58
|
+
ts.groupby([c.name for c in data[1:-1]])
|
|
59
59
|
.apply(self._shift)[data[-1].name]
|
|
60
60
|
.to_frame()
|
|
61
61
|
.reset_index()
|
|
@@ -386,6 +386,7 @@ class DataSourcePublisher:
|
|
|
386
386
|
search_keys = [k.value.value for k in search_keys] if search_keys else None
|
|
387
387
|
request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
|
|
388
388
|
task_id = self._rest_client.upload_online(request, trace_id)
|
|
389
|
+
print(f"Uploading online task created. task_id={task_id}")
|
|
389
390
|
with Spinner():
|
|
390
391
|
status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
|
|
391
392
|
while status_response["status"] not in self.FINAL_STATUSES:
|
upgini/dataset.py
CHANGED
|
@@ -41,6 +41,7 @@ from upgini.utils.target_utils import (
|
|
|
41
41
|
balance_undersample,
|
|
42
42
|
balance_undersample_forced,
|
|
43
43
|
balance_undersample_time_series,
|
|
44
|
+
balance_undersample_time_series_trunc,
|
|
44
45
|
)
|
|
45
46
|
|
|
46
47
|
try:
|
|
@@ -58,6 +59,8 @@ class Dataset: # (pd.DataFrame):
|
|
|
58
59
|
FIT_SAMPLE_THRESHOLD = 200_000
|
|
59
60
|
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
|
60
61
|
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
|
62
|
+
FIT_SAMPLE_THRESHOLD_TS = 54_000
|
|
63
|
+
FIT_SAMPLE_ROWS_TS = 54_000
|
|
61
64
|
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
|
62
65
|
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
|
63
66
|
IMBALANCE_THESHOLD = 0.6
|
|
@@ -304,6 +307,9 @@ class Dataset: # (pd.DataFrame):
|
|
|
304
307
|
if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
|
|
305
308
|
sample_threshold = self.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
|
|
306
309
|
sample_rows = self.FIT_SAMPLE_WITH_EVAL_SET_ROWS
|
|
310
|
+
elif self.cv_type is not None and self.cv_type.is_time_series():
|
|
311
|
+
sample_threshold = self.FIT_SAMPLE_THRESHOLD_TS
|
|
312
|
+
sample_rows = self.FIT_SAMPLE_ROWS_TS
|
|
307
313
|
else:
|
|
308
314
|
sample_threshold = self.FIT_SAMPLE_THRESHOLD
|
|
309
315
|
sample_rows = self.FIT_SAMPLE_ROWS
|
|
@@ -314,7 +320,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
314
320
|
f"and will be downsampled to {sample_rows}"
|
|
315
321
|
)
|
|
316
322
|
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
317
|
-
resampled_data =
|
|
323
|
+
resampled_data = balance_undersample_time_series_trunc(
|
|
318
324
|
df=self.data,
|
|
319
325
|
id_columns=self.id_columns,
|
|
320
326
|
date_column=next(
|
|
@@ -584,19 +590,31 @@ class Dataset: # (pd.DataFrame):
|
|
|
584
590
|
return search_customization
|
|
585
591
|
|
|
586
592
|
def _rename_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
|
587
|
-
if
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
593
|
+
if runtime_parameters is not None and runtime_parameters.properties is not None:
|
|
594
|
+
if "generate_features" in runtime_parameters.properties:
|
|
595
|
+
generate_features = runtime_parameters.properties["generate_features"].split(",")
|
|
596
|
+
renamed_generate_features = []
|
|
597
|
+
for f in generate_features:
|
|
598
|
+
for new_column, orig_column in self.columns_renaming.items():
|
|
599
|
+
if f == orig_column:
|
|
600
|
+
renamed_generate_features.append(new_column)
|
|
601
|
+
runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
|
|
602
|
+
if "columns_for_online_api" in runtime_parameters.properties:
|
|
603
|
+
columns_for_online_api = runtime_parameters.properties["columns_for_online_api"].split(",")
|
|
604
|
+
renamed_columns_for_online_api = []
|
|
605
|
+
for f in columns_for_online_api:
|
|
606
|
+
for new_column, orig_column in self.columns_renaming.items():
|
|
607
|
+
if f == orig_column:
|
|
608
|
+
renamed_columns_for_online_api.append(new_column)
|
|
609
|
+
runtime_parameters.properties["columns_for_online_api"] = ",".join(renamed_columns_for_online_api)
|
|
610
|
+
|
|
611
|
+
return runtime_parameters
|
|
599
612
|
|
|
613
|
+
def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
|
614
|
+
if runtime_parameters is not None and runtime_parameters.properties is not None:
|
|
615
|
+
if self.cv_type is not None and self.cv_type.is_time_series():
|
|
616
|
+
runtime_parameters.properties["sample_size"] = self.FIT_SAMPLE_ROWS_TS
|
|
617
|
+
runtime_parameters.properties["iter0_sample_size"] = self.FIT_SAMPLE_ROWS_TS
|
|
600
618
|
return runtime_parameters
|
|
601
619
|
|
|
602
620
|
def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
|
@@ -630,6 +648,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
630
648
|
file_metrics = FileMetrics()
|
|
631
649
|
|
|
632
650
|
runtime_parameters = self._rename_generate_features(runtime_parameters)
|
|
651
|
+
runtime_parameters = self._set_sample_size(runtime_parameters)
|
|
633
652
|
|
|
634
653
|
file_metadata = self.__construct_metadata(exclude_features_sources)
|
|
635
654
|
search_customization = self.__construct_search_customization(
|
upgini/features_enricher.py
CHANGED
|
@@ -222,6 +222,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
222
222
|
loss: Optional[str] = None,
|
|
223
223
|
detect_missing_search_keys: bool = True,
|
|
224
224
|
generate_features: Optional[List[str]] = None,
|
|
225
|
+
columns_for_online_api: Optional[List[str]] = None,
|
|
225
226
|
round_embeddings: Optional[int] = None,
|
|
226
227
|
logs_enabled: bool = True,
|
|
227
228
|
raise_validation_error: bool = True,
|
|
@@ -345,6 +346,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
345
346
|
self.logger.error(msg)
|
|
346
347
|
raise ValidationError(msg)
|
|
347
348
|
self.runtime_parameters.properties["round_embeddings"] = round_embeddings
|
|
349
|
+
self.columns_for_online_api = columns_for_online_api
|
|
350
|
+
if columns_for_online_api is not None:
|
|
351
|
+
self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
|
|
348
352
|
maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
|
|
349
353
|
if maybe_downsampling_limit is not None:
|
|
350
354
|
Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
|
|
@@ -1873,13 +1877,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1873
1877
|
|
|
1874
1878
|
# downsample if need to eval_set threshold
|
|
1875
1879
|
num_samples = _num_samples(df)
|
|
1876
|
-
phone_column = self._get_phone_column(self.search_keys)
|
|
1877
1880
|
force_downsampling = (
|
|
1878
1881
|
not self.disable_force_downsampling
|
|
1879
|
-
and self.
|
|
1880
|
-
and phone_column is not None
|
|
1881
|
-
and self.fit_columns_renaming is not None
|
|
1882
|
-
and self.fit_columns_renaming.get(phone_column) in self.generate_features
|
|
1882
|
+
and self.columns_for_online_api is not None
|
|
1883
1883
|
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1884
1884
|
)
|
|
1885
1885
|
if force_downsampling:
|
|
@@ -1948,7 +1948,27 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1948
1948
|
df, _ = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
|
|
1949
1949
|
|
|
1950
1950
|
num_samples = _num_samples(df)
|
|
1951
|
-
|
|
1951
|
+
force_downsampling = (
|
|
1952
|
+
not self.disable_force_downsampling
|
|
1953
|
+
and self.columns_for_online_api is not None
|
|
1954
|
+
and num_samples > Dataset.FORCE_SAMPLE_SIZE
|
|
1955
|
+
)
|
|
1956
|
+
if force_downsampling:
|
|
1957
|
+
self.logger.info(f"Force downsampling from {num_samples} to {Dataset.FORCE_SAMPLE_SIZE}")
|
|
1958
|
+
df = balance_undersample_forced(
|
|
1959
|
+
df=df,
|
|
1960
|
+
target_column=TARGET,
|
|
1961
|
+
id_columns=self.id_columns,
|
|
1962
|
+
date_column=self._get_date_column(self.search_keys),
|
|
1963
|
+
task_type=self.model_task_type,
|
|
1964
|
+
cv_type=self.cv,
|
|
1965
|
+
random_state=self.random_state,
|
|
1966
|
+
sample_size=Dataset.FORCE_SAMPLE_SIZE,
|
|
1967
|
+
logger=self.logger,
|
|
1968
|
+
bundle=self.bundle,
|
|
1969
|
+
warning_callback=self.__log_warning,
|
|
1970
|
+
)
|
|
1971
|
+
elif num_samples > Dataset.FIT_SAMPLE_THRESHOLD:
|
|
1952
1972
|
self.logger.info(f"Downsampling from {num_samples} to {Dataset.FIT_SAMPLE_ROWS}")
|
|
1953
1973
|
df = df.sample(n=Dataset.FIT_SAMPLE_ROWS, random_state=self.random_state)
|
|
1954
1974
|
|
|
@@ -2620,17 +2640,18 @@ if response.status_code == 200:
|
|
|
2620
2640
|
checked_generate_features = []
|
|
2621
2641
|
for gen_feature in self.generate_features:
|
|
2622
2642
|
if gen_feature not in x_columns:
|
|
2623
|
-
|
|
2624
|
-
|
|
2625
|
-
self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
|
|
2626
|
-
)
|
|
2627
|
-
else:
|
|
2628
|
-
self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
|
|
2643
|
+
msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
|
|
2644
|
+
self.__log_warning(msg)
|
|
2629
2645
|
else:
|
|
2630
2646
|
checked_generate_features.append(gen_feature)
|
|
2631
2647
|
self.generate_features = checked_generate_features
|
|
2632
2648
|
self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
|
|
2633
2649
|
|
|
2650
|
+
if self.columns_for_online_api is not None and len(self.columns_for_online_api) > 0:
|
|
2651
|
+
for column in self.columns_for_online_api:
|
|
2652
|
+
if column not in validated_X.columns:
|
|
2653
|
+
raise ValidationError(self.bundle.get("missing_column_for_online_api").format(column))
|
|
2654
|
+
|
|
2634
2655
|
if self.id_columns is not None:
|
|
2635
2656
|
for id_column in self.id_columns:
|
|
2636
2657
|
if id_column not in validated_X.columns:
|
|
@@ -2852,9 +2873,7 @@ if response.status_code == 200:
|
|
|
2852
2873
|
# Force downsampling to 7000 for API features generation
|
|
2853
2874
|
force_downsampling = (
|
|
2854
2875
|
not self.disable_force_downsampling
|
|
2855
|
-
and self.
|
|
2856
|
-
and phone_column is not None
|
|
2857
|
-
and self.fit_columns_renaming[phone_column] in self.generate_features
|
|
2876
|
+
and self.columns_for_online_api is not None
|
|
2858
2877
|
and len(df) > Dataset.FORCE_SAMPLE_SIZE
|
|
2859
2878
|
)
|
|
2860
2879
|
if force_downsampling:
|
|
@@ -111,6 +111,7 @@ x_is_empty=X is empty
|
|
|
111
111
|
y_is_empty=y is empty
|
|
112
112
|
x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
|
|
113
113
|
missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
|
|
114
|
+
missing_column_for_online_api=Column {} specified in `columns_for_online_api` is not present in input columns: {}
|
|
114
115
|
x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
|
|
115
116
|
train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
|
|
116
117
|
eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
|
upgini/utils/email_utils.py
CHANGED
|
@@ -116,17 +116,17 @@ class EmailSearchKeyConverter:
|
|
|
116
116
|
else:
|
|
117
117
|
df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
|
|
118
118
|
|
|
119
|
-
del self.search_keys[self.email_column]
|
|
120
|
-
if self.email_column in self.unnest_search_keys:
|
|
121
|
-
|
|
119
|
+
# del self.search_keys[self.email_column]
|
|
120
|
+
# if self.email_column in self.unnest_search_keys:
|
|
121
|
+
# self.unnest_search_keys.remove(self.email_column)
|
|
122
122
|
|
|
123
123
|
one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
|
|
124
124
|
df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
|
|
125
125
|
self.columns_renaming[one_domain_name] = original_email_column
|
|
126
126
|
self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
|
|
127
127
|
|
|
128
|
-
if self.email_converted_to_hem:
|
|
129
|
-
|
|
130
|
-
|
|
128
|
+
# if self.email_converted_to_hem:
|
|
129
|
+
# df = df.drop(columns=self.email_column)
|
|
130
|
+
# del self.columns_renaming[self.email_column]
|
|
131
131
|
|
|
132
132
|
return df
|
upgini/utils/target_utils.py
CHANGED
|
@@ -10,6 +10,7 @@ from upgini.errors import ValidationError
|
|
|
10
10
|
from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
|
|
11
11
|
from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
|
|
12
12
|
from upgini.sampler.random_under_sampler import RandomUnderSampler
|
|
13
|
+
from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
|
|
13
14
|
|
|
14
15
|
TS_MIN_DIFFERENT_IDS_RATIO = 0.2
|
|
15
16
|
|
|
@@ -241,7 +242,7 @@ def balance_undersample_forced(
|
|
|
241
242
|
df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
|
|
242
243
|
if cv_type is not None and cv_type.is_time_series():
|
|
243
244
|
logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
|
|
244
|
-
resampled_data =
|
|
245
|
+
resampled_data = balance_undersample_time_series_trunc(
|
|
245
246
|
df,
|
|
246
247
|
id_columns=id_columns,
|
|
247
248
|
date_column=date_column,
|
|
@@ -280,6 +281,58 @@ def balance_undersample_forced(
|
|
|
280
281
|
return resampled_data
|
|
281
282
|
|
|
282
283
|
|
|
284
|
+
DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
|
|
285
|
+
DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
|
|
286
|
+
DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def balance_undersample_time_series_trunc(
|
|
290
|
+
df: pd.DataFrame,
|
|
291
|
+
id_columns: List[str],
|
|
292
|
+
date_column: str,
|
|
293
|
+
sample_size: int,
|
|
294
|
+
random_state: int = 42,
|
|
295
|
+
logger: Optional[logging.Logger] = None,
|
|
296
|
+
highfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
|
|
297
|
+
lowfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
|
|
298
|
+
time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
|
|
299
|
+
**kwargs,
|
|
300
|
+
):
|
|
301
|
+
# Convert date column to datetime
|
|
302
|
+
dates_df = df[id_columns + [date_column]].copy()
|
|
303
|
+
dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
|
|
304
|
+
|
|
305
|
+
time_unit = get_most_frequent_time_unit(dates_df, id_columns, date_column)
|
|
306
|
+
if logger is not None:
|
|
307
|
+
logger.info(f"Time unit: {time_unit}")
|
|
308
|
+
|
|
309
|
+
if time_unit is None:
|
|
310
|
+
if logger is not None:
|
|
311
|
+
logger.info("Cannot detect time unit, returning original dataset")
|
|
312
|
+
return df
|
|
313
|
+
|
|
314
|
+
if time_unit < time_unit_threshold:
|
|
315
|
+
for trunc_length in highfreq_trunc_lengths:
|
|
316
|
+
sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length)
|
|
317
|
+
if len(sampled_df) <= sample_size:
|
|
318
|
+
break
|
|
319
|
+
if len(sampled_df) > sample_size:
|
|
320
|
+
sampled_df = balance_undersample_time_series(
|
|
321
|
+
sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
|
|
322
|
+
)
|
|
323
|
+
else:
|
|
324
|
+
for trunc_length in lowfreq_trunc_lengths:
|
|
325
|
+
sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length)
|
|
326
|
+
if len(sampled_df) <= sample_size:
|
|
327
|
+
break
|
|
328
|
+
if len(sampled_df) > sample_size:
|
|
329
|
+
sampled_df = balance_undersample_time_series(
|
|
330
|
+
sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
|
|
331
|
+
)
|
|
332
|
+
|
|
333
|
+
return df.loc[sampled_df.index]
|
|
334
|
+
|
|
335
|
+
|
|
283
336
|
def balance_undersample_time_series(
|
|
284
337
|
df: pd.DataFrame,
|
|
285
338
|
id_columns: List[str],
|
upgini/utils/ts_utils.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from typing import List, Optional
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def get_most_frequent_time_unit(df: pd.DataFrame, id_columns: List[str], date_column: str) -> Optional[pd.DateOffset]:
|
|
7
|
+
|
|
8
|
+
def closest_unit(diff):
|
|
9
|
+
return pd.tseries.frequencies.to_offset(pd.Timedelta(diff, unit="s"))
|
|
10
|
+
|
|
11
|
+
# Calculate differences for each ID group
|
|
12
|
+
all_diffs = []
|
|
13
|
+
groups = df.groupby(id_columns) if id_columns else [(None, df)]
|
|
14
|
+
for _, group in groups:
|
|
15
|
+
# Get sorted dates for this group
|
|
16
|
+
group_dates = group[date_column].sort_values().unique()
|
|
17
|
+
if len(group_dates) > 1:
|
|
18
|
+
# Calculate time differences between consecutive dates
|
|
19
|
+
diff_series = pd.Series(group_dates[1:] - group_dates[:-1])
|
|
20
|
+
# Convert to nanoseconds
|
|
21
|
+
diff_ns = diff_series.dt.total_seconds()
|
|
22
|
+
all_diffs.extend(diff_ns)
|
|
23
|
+
|
|
24
|
+
# Convert to series for easier processing
|
|
25
|
+
all_diffs = pd.Series(all_diffs)
|
|
26
|
+
|
|
27
|
+
# Get most common time unit across all groups
|
|
28
|
+
most_frequent_unit = all_diffs.apply(closest_unit).mode().min()
|
|
29
|
+
|
|
30
|
+
return most_frequent_unit if isinstance(most_frequent_unit, pd.DateOffset) else None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def trunc_datetime(
|
|
34
|
+
df: pd.DataFrame,
|
|
35
|
+
id_columns: List[str],
|
|
36
|
+
date_column: str,
|
|
37
|
+
length: pd.DateOffset,
|
|
38
|
+
logger: Optional[logging.Logger] = None,
|
|
39
|
+
) -> pd.DataFrame:
|
|
40
|
+
if logger is not None:
|
|
41
|
+
logger.info(f"Truncating time series dataset to {length}")
|
|
42
|
+
|
|
43
|
+
if id_columns:
|
|
44
|
+
min_datetime = df.groupby(id_columns)[date_column].transform(lambda group: group.max() - length)
|
|
45
|
+
else:
|
|
46
|
+
min_datetime = df[date_column].max() - length
|
|
47
|
+
return df[df[date_column] > min_datetime]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.60a3792.dev1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=yYjoHiqKj96yFzYqXlsnJPzF_FcgZvyGwKBQjTVsNi4,33
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=iSZX4KiDJlJFukNnAzBgkuT3UqbS-pyOyJlVXwTyaU0,34993
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=IXU6ahvQqMGLdZsrHCjOGEia1pBAgixfld3pNVPcGEM,202468
|
|
7
7
|
upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
|
|
@@ -21,16 +21,16 @@ upgini/autofe/feature.py,sha256=zvRdlxCkaOsX0XiragNvh0tAPyOWut0MQTq5JGU5HtY,1474
|
|
|
21
21
|
upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
|
|
22
22
|
upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
|
|
23
23
|
upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
|
|
24
|
-
upgini/autofe/vector.py,sha256=
|
|
24
|
+
upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
|
|
25
25
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
26
|
-
upgini/data_source/data_source_publisher.py,sha256=
|
|
26
|
+
upgini/data_source/data_source_publisher.py,sha256=0vaYz5v3KclJnA6jAWiTUiMQO5mbBTBINWV9jr2F5xM,22591
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=UXMiaFP3p-WdiXyZJN3O_OZstb-F33BWVDxDiofyxd4,27464
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -46,7 +46,7 @@ upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
|
46
46
|
upgini/utils/datetime_utils.py,sha256=RVAk4_rakK8X9zjybK3-rj0to0e3elye8tnBuA4wTWU,13491
|
|
47
47
|
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
48
48
|
upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
|
|
49
|
-
upgini/utils/email_utils.py,sha256=
|
|
49
|
+
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
|
50
50
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
51
51
|
upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,6766
|
|
52
52
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
@@ -56,10 +56,11 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
|
|
|
56
56
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
57
57
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
58
58
|
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
59
|
-
upgini/utils/target_utils.py,sha256=
|
|
59
|
+
upgini/utils/target_utils.py,sha256=a7Ck7WgQeUhDrnluOdFXvOdX6zDL-4Wiqt_f4jZxHag,16543
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
|
+
upgini/utils/ts_utils.py,sha256=_YbNVE144vtEPlvLpvPGguDNzrnUM9IIjdX2VQz4T7E,1671
|
|
61
62
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
64
|
-
upgini-1.2.
|
|
65
|
-
upgini-1.2.
|
|
63
|
+
upgini-1.2.60a3792.dev1.dist-info/METADATA,sha256=4k4LdGfGvuhNHhpT83pomgnfvZr8x2fKQDQbFCEAyPA,49065
|
|
64
|
+
upgini-1.2.60a3792.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
65
|
+
upgini-1.2.60a3792.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
66
|
+
upgini-1.2.60a3792.dev1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|