upgini 1.2.61__py3-none-any.whl → 1.2.62__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic; see the registry's advisory page for this release for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.61"
1
+ __version__ = "1.2.62"
upgini/dataset.py CHANGED
@@ -40,7 +40,7 @@ from upgini.utils.email_utils import EmailSearchKeyConverter
40
40
  from upgini.utils.target_utils import (
41
41
  balance_undersample,
42
42
  balance_undersample_forced,
43
- balance_undersample_time_series,
43
+ balance_undersample_time_series_trunc,
44
44
  )
45
45
 
46
46
  try:
@@ -58,6 +58,8 @@ class Dataset: # (pd.DataFrame):
58
58
  FIT_SAMPLE_THRESHOLD = 200_000
59
59
  FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
60
60
  FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
61
+ FIT_SAMPLE_THRESHOLD_TS = 54_000
62
+ FIT_SAMPLE_ROWS_TS = 54_000
61
63
  BINARY_MIN_SAMPLE_THRESHOLD = 5_000
62
64
  MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
63
65
  IMBALANCE_THESHOLD = 0.6
@@ -301,7 +303,10 @@ class Dataset: # (pd.DataFrame):
301
303
  )
302
304
 
303
305
  # Resample over fit threshold
304
- if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
306
+ if self.cv_type is not None and self.cv_type.is_time_series():
307
+ sample_threshold = self.FIT_SAMPLE_THRESHOLD_TS
308
+ sample_rows = self.FIT_SAMPLE_ROWS_TS
309
+ elif not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
305
310
  sample_threshold = self.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
306
311
  sample_rows = self.FIT_SAMPLE_WITH_EVAL_SET_ROWS
307
312
  else:
@@ -314,7 +319,7 @@ class Dataset: # (pd.DataFrame):
314
319
  f"and will be downsampled to {sample_rows}"
315
320
  )
316
321
  if self.cv_type is not None and self.cv_type.is_time_series():
317
- resampled_data = balance_undersample_time_series(
322
+ resampled_data = balance_undersample_time_series_trunc(
318
323
  df=self.data,
319
324
  id_columns=self.id_columns,
320
325
  date_column=next(
@@ -584,10 +589,7 @@ class Dataset: # (pd.DataFrame):
584
589
  return search_customization
585
590
 
586
591
  def _rename_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
587
- if (
588
- runtime_parameters is not None
589
- and runtime_parameters.properties is not None
590
- ):
592
+ if runtime_parameters is not None and runtime_parameters.properties is not None:
591
593
  if "generate_features" in runtime_parameters.properties:
592
594
  generate_features = runtime_parameters.properties["generate_features"].split(",")
593
595
  renamed_generate_features = []
@@ -607,6 +609,13 @@ class Dataset: # (pd.DataFrame):
607
609
 
608
610
  return runtime_parameters
609
611
 
612
+ def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
613
+ if runtime_parameters is not None and runtime_parameters.properties is not None:
614
+ if self.cv_type is not None and self.cv_type.is_time_series():
615
+ runtime_parameters.properties["sample_size"] = self.FIT_SAMPLE_ROWS_TS
616
+ runtime_parameters.properties["iter0_sample_size"] = self.FIT_SAMPLE_ROWS_TS
617
+ return runtime_parameters
618
+
610
619
  def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
611
620
  if (
612
621
  runtime_parameters is not None
@@ -638,6 +647,7 @@ class Dataset: # (pd.DataFrame):
638
647
  file_metrics = FileMetrics()
639
648
 
640
649
  runtime_parameters = self._rename_generate_features(runtime_parameters)
650
+ runtime_parameters = self._set_sample_size(runtime_parameters)
641
651
 
642
652
  file_metadata = self.__construct_metadata(exclude_features_sources)
643
653
  search_customization = self.__construct_search_customization(
@@ -9,6 +9,7 @@ from upgini.errors import ValidationError
9
9
  from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
10
10
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
11
11
  from upgini.sampler.random_under_sampler import RandomUnderSampler
12
+ from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
12
13
 
13
14
  TS_MIN_DIFFERENT_IDS_RATIO = 0.2
14
15
 
@@ -240,7 +241,7 @@ def balance_undersample_forced(
240
241
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
241
242
  if cv_type is not None and cv_type.is_time_series():
242
243
  logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
243
- resampled_data = balance_undersample_time_series(
244
+ resampled_data = balance_undersample_time_series_trunc(
244
245
  df,
245
246
  id_columns=id_columns,
246
247
  date_column=date_column,
@@ -279,6 +280,58 @@ def balance_undersample_forced(
279
280
  return resampled_data
280
281
 
281
282
 
283
DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)


def balance_undersample_time_series_trunc(
    df: pd.DataFrame,
    id_columns: List[str],
    date_column: str,
    sample_size: int,
    random_state: int = 42,
    logger: Optional[logging.Logger] = None,
    highfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
    lowfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
    time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
    **kwargs,
):
    """Downsample a time-series dataset to ``sample_size`` rows, preferring truncation.

    First detects the dominant sampling frequency of the series; if none can be
    detected, the original ``df`` is returned untouched. Otherwise the history of
    each series is progressively truncated to shorter trailing windows (short
    windows for high-frequency data, long windows for low-frequency data) until
    the row budget is met; only if truncation alone is insufficient does it fall
    back to ``balance_undersample_time_series`` for id/date-aware undersampling.

    Parameters mirror ``balance_undersample_time_series``; extra ``kwargs`` are
    forwarded to it. Returns a row-subset of ``df`` selected by index.
    """
    # NOTE(review): assumes the date column holds epoch milliseconds — confirm with callers.
    dates_df = df[id_columns + [date_column]].copy()
    dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")

    time_unit = get_most_frequent_time_unit(dates_df, id_columns, date_column)
    if logger is not None:
        logger.info(f"Time unit: {time_unit}")

    if time_unit is None:
        if logger is not None:
            logger.info("Cannot detect time unit, returning original dataset")
        return df

    # High-frequency series use short truncation windows, low-frequency long ones.
    # The truncation loop itself is identical for both cases, so branch only on
    # the list of candidate window lengths (the original duplicated the loop).
    if time_unit < time_unit_threshold:
        trunc_lengths = highfreq_trunc_lengths
    else:
        trunc_lengths = lowfreq_trunc_lengths

    # Initialize so an empty trunc_lengths list cannot leave sampled_df unbound.
    sampled_df = dates_df
    for trunc_length in trunc_lengths:
        sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
        if len(sampled_df) <= sample_size:
            break
    if len(sampled_df) > sample_size:
        # Truncation alone was not enough — undersample within the kept window.
        sampled_df = balance_undersample_time_series(
            sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
        )

    return df.loc[sampled_df.index]
333
+
334
+
282
335
  def balance_undersample_time_series(
283
336
  df: pd.DataFrame,
284
337
  id_columns: List[str],
@@ -0,0 +1,41 @@
1
+ import logging
2
+ from typing import List, Optional
3
+ import pandas as pd
4
+
5
+
6
def get_most_frequent_time_unit(df: pd.DataFrame, id_columns: List[str], date_column: str) -> Optional[pd.DateOffset]:
    """Detect the dominant sampling frequency of a (possibly multi-id) time series.

    Gathers the gaps between consecutive distinct dates within each id group,
    maps every gap to the nearest pandas frequency offset, and returns the most
    common one (ties broken by the smallest offset). Returns ``None`` when no
    gap exists or the winner is not a ``DateOffset``.
    """

    def to_nearest_offset(gap_seconds):
        return pd.tseries.frequencies.to_offset(pd.Timedelta(gap_seconds, unit="s"))

    gaps_seconds = []
    grouped = df.groupby(id_columns) if id_columns else [(None, df)]
    for _, part in grouped:
        unique_dates = part[date_column].sort_values().unique()
        if len(unique_dates) > 1:
            consecutive_gaps = pd.Series(unique_dates[1:] - unique_dates[:-1])
            gaps_seconds.extend(consecutive_gaps.dt.total_seconds())

    candidate = pd.Series(gaps_seconds).apply(to_nearest_offset).mode().min()

    return candidate if isinstance(candidate, pd.DateOffset) else None
25
+
26
+
27
def trunc_datetime(
    df: pd.DataFrame,
    id_columns: List[str],
    date_column: str,
    length: pd.DateOffset,
    logger: Optional[logging.Logger] = None,
) -> pd.DataFrame:
    """Keep only the trailing ``length`` window of each time series.

    The cutoff is computed per id group when ``id_columns`` is non-empty,
    otherwise from the global maximum date. Rows exactly at the cutoff are
    dropped (strict ``>`` comparison).
    """
    if logger is not None:
        logger.info(f"Truncating time series dataset to {length}")

    if not id_columns:
        cutoff = df[date_column].max() - length
    else:
        cutoff = df.groupby(id_columns)[date_column].transform(lambda dates: dates.max() - length)
    return df[df[date_column] > cutoff]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.61
3
+ Version: 1.2.62
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,7 +1,7 @@
1
- upgini/__about__.py,sha256=17s3XgKQ6UgMiFGNXwnQprj1EsjPUiE6QGnAzyDIfhs,23
1
+ upgini/__about__.py,sha256=X-PIyJPyy-W4DbKWDuHTMhmvRT8La2rsZ63Zaf_MERI,23
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=NP5vHqEfZQ1HWz3TcNAa_OhXG8wiMRdydm26D6UBiRU,34166
4
+ upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
6
  upgini/features_enricher.py,sha256=2AMEXtoMrEFw3f0b0CsvkFyS1a7L4aqI2GO_fCsgWac,205336
7
7
  upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
@@ -58,10 +58,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
58
58
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
59
59
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
60
60
  upgini/utils/sort.py,sha256=w-CoT33W_53ekOROpKI_VRsRmiyWNr2b3IpE5_4MLLA,6395
61
- upgini/utils/target_utils.py,sha256=VsMdlS04_9SHlB2DPfSWTeqjc2JoXR5OPvu4qmvkmkg,14347
61
+ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
62
62
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
63
+ upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
63
64
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
64
- upgini-1.2.61.dist-info/METADATA,sha256=hH2eL4JHq8BjVpY3ZNFYDqUtKs5psdoiVM5jiXjs0yU,49084
65
- upgini-1.2.61.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
66
- upgini-1.2.61.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
67
- upgini-1.2.61.dist-info/RECORD,,
65
+ upgini-1.2.62.dist-info/METADATA,sha256=l1TBHJEV26NNT_Er41bbO3ph5UZ-QkzYTpf_JU1Y7ak,49084
66
+ upgini-1.2.62.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
67
+ upgini-1.2.62.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
68
+ upgini-1.2.62.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any