upgini 1.2.61__tar.gz → 1.2.62a3818.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic; see the release details below for more information.

Files changed (71)
  1. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/PKG-INFO +1 -1
  2. upgini-1.2.62a3818.dev1/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/all_operands.py +2 -2
  4. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/binary.py +1 -1
  5. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/date.py +1 -1
  6. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/feature.py +1 -1
  7. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/groupby.py +1 -1
  8. upgini-1.2.61/src/upgini/autofe/operand.py → upgini-1.2.62a3818.dev1/src/upgini/autofe/operator.py +2 -2
  9. upgini-1.2.61/src/upgini/autofe/vector.py → upgini-1.2.62a3818.dev1/src/upgini/autofe/timeseries.py +3 -23
  10. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/unary.py +1 -1
  11. upgini-1.2.62a3818.dev1/src/upgini/autofe/vector.py +24 -0
  12. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/dataset.py +17 -7
  13. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/features_enricher.py +1 -1
  14. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/target_utils.py +54 -1
  15. upgini-1.2.62a3818.dev1/src/upgini/utils/ts_utils.py +41 -0
  16. upgini-1.2.61/src/upgini/__about__.py +0 -1
  17. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/.gitignore +0 -0
  18. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/LICENSE +0 -0
  19. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/README.md +0 -0
  20. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/pyproject.toml +0 -0
  21. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/__init__.py +0 -0
  22. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/ads.py +0 -0
  23. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/ads_management/__init__.py +0 -0
  24. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
  25. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/autofe/__init__.py +0 -0
  26. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/data_source/__init__.py +0 -0
  27. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/data_source/data_source_publisher.py +0 -0
  28. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/errors.py +0 -0
  29. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/http.py +0 -0
  30. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/lazy_import.py +0 -0
  31. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/mdc/__init__.py +0 -0
  32. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/mdc/context.py +0 -0
  33. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/metadata.py +0 -0
  34. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/metrics.py +0 -0
  35. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/normalizer/__init__.py +0 -0
  36. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/normalizer/normalize_utils.py +0 -0
  37. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
  38. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
  39. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/resource_bundle/strings.properties +0 -0
  40. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  41. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/sampler/__init__.py +0 -0
  42. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/sampler/base.py +0 -0
  43. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
  44. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/sampler/utils.py +0 -0
  45. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/search_task.py +0 -0
  46. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/spinner.py +0 -0
  47. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  48. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/__init__.py +0 -0
  49. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
  50. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
  51. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/country_utils.py +0 -0
  52. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
  53. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/cv_utils.py +0 -0
  54. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/datetime_utils.py +0 -0
  55. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/deduplicate_utils.py +0 -0
  56. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/display_utils.py +0 -0
  57. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/email_utils.py +0 -0
  58. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  59. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/feature_info.py +0 -0
  60. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/features_validator.py +0 -0
  61. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/format.py +0 -0
  62. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/ip_utils.py +0 -0
  63. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/mstats.py +0 -0
  64. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/phone_utils.py +0 -0
  65. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/postal_code_utils.py +0 -0
  66. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/progress_bar.py +0 -0
  67. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/sklearn_ext.py +0 -0
  68. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/sort.py +0 -0
  69. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/track_info.py +0 -0
  70. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/utils/warning_counter.py +0 -0
  71. {upgini-1.2.61 → upgini-1.2.62a3818.dev1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.61
3
+ Version: 1.2.62a3818.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.62a3818.dev1"
@@ -1,4 +1,4 @@
1
- from upgini.autofe.operand import OperandRegistry
1
+ from upgini.autofe.operator import OperatorRegistry
2
2
  from upgini.autofe.unary import * # noqa
3
3
  from upgini.autofe.binary import * # noqa
4
4
  from upgini.autofe.groupby import * # noqa
@@ -7,4 +7,4 @@ from upgini.autofe.vector import * # noqa
7
7
 
8
8
 
9
9
  def find_op(name):
10
- return OperandRegistry.get_operand(name)
10
+ return OperatorRegistry.get_operand(name)
@@ -5,7 +5,7 @@ import numpy as np
5
5
  import pandas as pd
6
6
  from jarowinkler import jarowinkler_similarity
7
7
 
8
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
8
+ from upgini.autofe.operator import PandasOperand, VectorizableMixin
9
9
 
10
10
 
11
11
  class Min(PandasOperand):
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas.core.arrays.timedeltas import TimedeltaArray
8
8
  from pydantic import BaseModel, __version__ as pydantic_version
9
9
 
10
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand
10
+ from upgini.autofe.operator import PandasOperand, ParametrizedOperand
11
11
 
12
12
 
13
13
  def get_pydantic_version():
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas._typing import DtypeObj
8
8
 
9
9
  from upgini.autofe.all_operands import find_op
10
- from upgini.autofe.operand import Operand, PandasOperand
10
+ from upgini.autofe.operator import Operand, PandasOperand
11
11
 
12
12
 
13
13
  class Column:
@@ -2,7 +2,7 @@ from typing import Optional
2
2
 
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
5
+ from upgini.autofe.operator import PandasOperand, ParametrizedOperand, VectorizableMixin
6
6
 
7
7
 
8
8
  class GroupByThenAgg(
@@ -6,7 +6,7 @@ import pandas as pd
6
6
  from pydantic import BaseModel
7
7
 
8
8
 
9
- class OperandRegistry(type(BaseModel)):
9
+ class OperatorRegistry(type(BaseModel)):
10
10
  _registry = {}
11
11
  _parametrized_registry = []
12
12
 
@@ -46,7 +46,7 @@ class OperandRegistry(type(BaseModel)):
46
46
  return None
47
47
 
48
48
 
49
- class Operand(BaseModel, metaclass=OperandRegistry):
49
+ class Operand(BaseModel, metaclass=OperatorRegistry):
50
50
  name: Optional[str] = None
51
51
  alias: Optional[str] = None
52
52
  is_unary: bool = False
@@ -2,33 +2,13 @@ import abc
2
2
  from typing import Dict, List, Optional
3
3
 
4
4
  import pandas as pd
5
+ from upgini.autofe.operator import PandasOperand, ParametrizedOperand
5
6
 
6
7
  try:
7
8
  from pydantic import field_validator as validator # V2
8
9
  except ImportError:
9
10
  from pydantic import validator # V1
10
11
 
11
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
12
-
13
-
14
- class Mean(PandasOperand, VectorizableMixin):
15
- name: str = "mean"
16
- output_type: Optional[str] = "float"
17
- is_vector: bool = True
18
- group_index: int = 0
19
-
20
- def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
21
- return pd.DataFrame(data).T.fillna(0).mean(axis=1)
22
-
23
-
24
- class Sum(PandasOperand, VectorizableMixin):
25
- name: str = "sum"
26
- is_vector: bool = True
27
- group_index: int = 0
28
-
29
- def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
30
- return pd.DataFrame(data).T.fillna(0).sum(axis=1)
31
-
32
12
 
33
13
  class TimeSeriesBase(PandasOperand, abc.ABC):
34
14
  is_vector: bool = True
@@ -55,7 +35,7 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
55
35
  ts.set_index(date.name, inplace=True)
56
36
  ts = ts[ts.index.notna()].sort_index()
57
37
  ts = (
58
- ts.groupby([c.name for c in data[1:-1]])
38
+ ts.groupby([c.name for c in data[1:-1]], group_keys=True)
59
39
  .apply(self._shift)[data[-1].name]
60
40
  .to_frame()
61
41
  .reset_index()
@@ -84,7 +64,7 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
84
64
  pass
85
65
 
86
66
 
87
- _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
67
+ _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean(), "last": lambda x: x[-1]}
88
68
 
89
69
 
90
70
  class Roll(TimeSeriesBase, ParametrizedOperand):
@@ -2,7 +2,7 @@ from typing import Dict, Optional
2
2
  import numpy as np
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
5
+ from upgini.autofe.operator import PandasOperand, VectorizableMixin
6
6
 
7
7
 
8
8
  class Abs(PandasOperand, VectorizableMixin):
@@ -0,0 +1,24 @@
1
+ from typing import List, Optional
2
+
3
+ import pandas as pd
4
+
5
+ from upgini.autofe.operator import PandasOperand, VectorizableMixin
6
+
7
+
8
class Mean(PandasOperand, VectorizableMixin):
    """Vector operator: element-wise mean across a list of input series.

    The inputs are stacked as columns (aligned on index), missing values are
    treated as 0, and each row is averaged.
    """

    name: str = "mean"
    output_type: Optional[str] = "float"
    is_vector: bool = True
    group_index: int = 0

    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
        # Stack inputs as columns, zero-fill gaps, then average per row.
        stacked = pd.DataFrame(data).T
        return stacked.fillna(0).mean(axis=1)
16
+
17
+
18
class Sum(PandasOperand, VectorizableMixin):
    """Vector operator: element-wise sum across a list of input series.

    The inputs are stacked as columns (aligned on index), missing values are
    treated as 0, and each row is summed.
    """

    name: str = "sum"
    is_vector: bool = True
    group_index: int = 0

    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
        # Stack inputs as columns, zero-fill gaps, then sum each row.
        stacked = pd.DataFrame(data).T
        return stacked.fillna(0).sum(axis=1)
@@ -40,7 +40,7 @@ from upgini.utils.email_utils import EmailSearchKeyConverter
40
40
  from upgini.utils.target_utils import (
41
41
  balance_undersample,
42
42
  balance_undersample_forced,
43
- balance_undersample_time_series,
43
+ balance_undersample_time_series_trunc,
44
44
  )
45
45
 
46
46
  try:
@@ -58,6 +58,8 @@ class Dataset: # (pd.DataFrame):
58
58
  FIT_SAMPLE_THRESHOLD = 200_000
59
59
  FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
60
60
  FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
61
+ FIT_SAMPLE_THRESHOLD_TS = 54_000
62
+ FIT_SAMPLE_ROWS_TS = 54_000
61
63
  BINARY_MIN_SAMPLE_THRESHOLD = 5_000
62
64
  MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
63
65
  IMBALANCE_THESHOLD = 0.6
@@ -301,7 +303,10 @@ class Dataset: # (pd.DataFrame):
301
303
  )
302
304
 
303
305
  # Resample over fit threshold
304
- if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
306
+ if self.cv_type is not None and self.cv_type.is_time_series():
307
+ sample_threshold = self.FIT_SAMPLE_THRESHOLD_TS
308
+ sample_rows = self.FIT_SAMPLE_ROWS_TS
309
+ elif not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
305
310
  sample_threshold = self.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
306
311
  sample_rows = self.FIT_SAMPLE_WITH_EVAL_SET_ROWS
307
312
  else:
@@ -314,7 +319,7 @@ class Dataset: # (pd.DataFrame):
314
319
  f"and will be downsampled to {sample_rows}"
315
320
  )
316
321
  if self.cv_type is not None and self.cv_type.is_time_series():
317
- resampled_data = balance_undersample_time_series(
322
+ resampled_data = balance_undersample_time_series_trunc(
318
323
  df=self.data,
319
324
  id_columns=self.id_columns,
320
325
  date_column=next(
@@ -584,10 +589,7 @@ class Dataset: # (pd.DataFrame):
584
589
  return search_customization
585
590
 
586
591
  def _rename_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
587
- if (
588
- runtime_parameters is not None
589
- and runtime_parameters.properties is not None
590
- ):
592
+ if runtime_parameters is not None and runtime_parameters.properties is not None:
591
593
  if "generate_features" in runtime_parameters.properties:
592
594
  generate_features = runtime_parameters.properties["generate_features"].split(",")
593
595
  renamed_generate_features = []
@@ -607,6 +609,13 @@ class Dataset: # (pd.DataFrame):
607
609
 
608
610
  return runtime_parameters
609
611
 
612
def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
    """For time-series CV, pin the backend sampling sizes to the TS row limit.

    Mutates ``runtime_parameters.properties`` in place and returns the same
    object (or ``None`` passthrough when nothing to update).
    """
    if runtime_parameters is None or runtime_parameters.properties is None:
        return runtime_parameters
    if self.cv_type is not None and self.cv_type.is_time_series():
        # Keep server-side sampling consistent with the client-side TS threshold.
        runtime_parameters.properties["sample_size"] = self.FIT_SAMPLE_ROWS_TS
        runtime_parameters.properties["iter0_sample_size"] = self.FIT_SAMPLE_ROWS_TS
    return runtime_parameters
618
+
610
619
  def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
611
620
  if (
612
621
  runtime_parameters is not None
@@ -638,6 +647,7 @@ class Dataset: # (pd.DataFrame):
638
647
  file_metrics = FileMetrics()
639
648
 
640
649
  runtime_parameters = self._rename_generate_features(runtime_parameters)
650
+ runtime_parameters = self._set_sample_size(runtime_parameters)
641
651
 
642
652
  file_metadata = self.__construct_metadata(exclude_features_sources)
643
653
  search_customization = self.__construct_search_customization(
@@ -31,7 +31,7 @@ from sklearn.exceptions import NotFittedError
31
31
  from sklearn.model_selection import BaseCrossValidator
32
32
 
33
33
  from upgini.autofe.feature import Feature
34
- from upgini.autofe.vector import TimeSeriesBase
34
+ from upgini.autofe.timeseries import TimeSeriesBase
35
35
  from upgini.data_source.data_source_publisher import CommercialSchema
36
36
  from upgini.dataset import Dataset
37
37
  from upgini.errors import HttpError, ValidationError
@@ -9,6 +9,7 @@ from upgini.errors import ValidationError
9
9
  from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
10
10
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
11
11
  from upgini.sampler.random_under_sampler import RandomUnderSampler
12
+ from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
12
13
 
13
14
  TS_MIN_DIFFERENT_IDS_RATIO = 0.2
14
15
 
@@ -240,7 +241,7 @@ def balance_undersample_forced(
240
241
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
241
242
  if cv_type is not None and cv_type.is_time_series():
242
243
  logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
243
- resampled_data = balance_undersample_time_series(
244
+ resampled_data = balance_undersample_time_series_trunc(
244
245
  df,
245
246
  id_columns=id_columns,
246
247
  date_column=date_column,
@@ -279,6 +280,58 @@ def balance_undersample_forced(
279
280
  return resampled_data
280
281
 
281
282
 
283
+ DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
284
+ DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
285
+ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
286
+
287
+
288
def balance_undersample_time_series_trunc(
    df: pd.DataFrame,
    id_columns: List[str],
    date_column: str,
    sample_size: int,
    random_state: int = 42,
    logger: Optional[logging.Logger] = None,
    highfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
    lowfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
    time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
    **kwargs,
):
    """Downsample a time-series dataset to at most ``sample_size`` rows.

    Strategy: detect the dominant sampling frequency, then progressively
    truncate each series to its most recent trailing window (shorter windows
    for high-frequency data) until the dataset fits; if truncation alone is
    not enough, fall back to ``balance_undersample_time_series``. Rows are
    selected by index, so the result is a row-subset of the original ``df``.

    Returns ``df`` unchanged when no time unit can be detected.
    """
    # Work on a projection with a parsed datetime column (dates arrive as epoch ms).
    dates_df = df[id_columns + [date_column]].copy()
    dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")

    time_unit = get_most_frequent_time_unit(dates_df, id_columns, date_column)
    if logger is not None:
        logger.info(f"Time unit: {time_unit}")

    if time_unit is None:
        if logger is not None:
            logger.info("Cannot detect time unit, returning original dataset")
        return df

    # High-frequency series can be truncated to much shorter trailing windows;
    # both branches previously duplicated the same loop, so select the list once.
    if time_unit < time_unit_threshold:
        trunc_lengths = highfreq_trunc_lengths
    else:
        trunc_lengths = lowfreq_trunc_lengths

    # Guard against an empty trunc-length list (the old code left sampled_df unbound).
    sampled_df = dates_df
    for trunc_length in trunc_lengths:
        sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
        if len(sampled_df) <= sample_size:
            break
    if len(sampled_df) > sample_size:
        # Truncation was not enough - undersample the remaining rows.
        sampled_df = balance_undersample_time_series(
            sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
        )

    return df.loc[sampled_df.index]
333
+
334
+
282
335
  def balance_undersample_time_series(
283
336
  df: pd.DataFrame,
284
337
  id_columns: List[str],
@@ -0,0 +1,41 @@
1
+ import logging
2
+ from typing import List, Optional
3
+ import pandas as pd
4
+
5
+
6
def get_most_frequent_time_unit(df: pd.DataFrame, id_columns: List[str], date_column: str) -> Optional[pd.DateOffset]:
    """Detect the dominant sampling frequency of a (possibly multi-series) dataset.

    For each series identified by ``id_columns`` (or the whole frame when the
    list is empty), the gaps between consecutive distinct dates are collected;
    each gap is mapped to the closest pandas offset and the most common offset
    is returned (ties broken by the smallest offset).

    Returns ``None`` when no frequency can be inferred, i.e. no group has two
    or more distinct dates.
    """

    def closest_unit(diff_seconds: float) -> pd.DateOffset:
        # Map a raw gap (in seconds) to the nearest pandas frequency offset.
        return pd.tseries.frequencies.to_offset(pd.Timedelta(diff_seconds, unit="s"))

    all_diffs = []
    groups = df.groupby(id_columns) if id_columns else [(None, df)]
    for _, group in groups:
        group_dates = group[date_column].sort_values().unique()
        if len(group_dates) > 1:
            # Gaps between consecutive distinct timestamps, in seconds
            # (the old variable name "diff_ns" was misleading).
            diff_seconds = pd.Series(group_dates[1:] - group_dates[:-1]).dt.total_seconds()
            all_diffs.extend(diff_seconds)

    if not all_diffs:
        # No series had two or more distinct dates - frequency is undefined.
        return None

    most_frequent_unit = pd.Series(all_diffs).apply(closest_unit).mode().min()

    return most_frequent_unit if isinstance(most_frequent_unit, pd.DateOffset) else None
25
+
26
+
27
+ def trunc_datetime(
28
+ df: pd.DataFrame,
29
+ id_columns: List[str],
30
+ date_column: str,
31
+ length: pd.DateOffset,
32
+ logger: Optional[logging.Logger] = None,
33
+ ) -> pd.DataFrame:
34
+ if logger is not None:
35
+ logger.info(f"Truncating time series dataset to {length}")
36
+
37
+ if id_columns:
38
+ min_datetime = df.groupby(id_columns)[date_column].transform(lambda group: group.max() - length)
39
+ else:
40
+ min_datetime = df[date_column].max() - length
41
+ return df[df[date_column] > min_datetime]
@@ -1 +0,0 @@
1
- __version__ = "1.2.61"
File without changes
File without changes
File without changes