upgini 1.2.61__py3-none-any.whl → 1.2.62a3818.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.61"
1
+ __version__ = "1.2.62a3818.dev1"
@@ -1,4 +1,4 @@
1
- from upgini.autofe.operand import OperandRegistry
1
+ from upgini.autofe.operator import OperatorRegistry
2
2
  from upgini.autofe.unary import * # noqa
3
3
  from upgini.autofe.binary import * # noqa
4
4
  from upgini.autofe.groupby import * # noqa
@@ -7,4 +7,4 @@ from upgini.autofe.vector import * # noqa
7
7
 
8
8
 
9
9
  def find_op(name):
10
- return OperandRegistry.get_operand(name)
10
+ return OperatorRegistry.get_operand(name)
upgini/autofe/binary.py CHANGED
@@ -5,7 +5,7 @@ import numpy as np
5
5
  import pandas as pd
6
6
  from jarowinkler import jarowinkler_similarity
7
7
 
8
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
8
+ from upgini.autofe.operator import PandasOperand, VectorizableMixin
9
9
 
10
10
 
11
11
  class Min(PandasOperand):
upgini/autofe/date.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas.core.arrays.timedeltas import TimedeltaArray
8
8
  from pydantic import BaseModel, __version__ as pydantic_version
9
9
 
10
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand
10
+ from upgini.autofe.operator import PandasOperand, ParametrizedOperand
11
11
 
12
12
 
13
13
  def get_pydantic_version():
upgini/autofe/feature.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas._typing import DtypeObj
8
8
 
9
9
  from upgini.autofe.all_operands import find_op
10
- from upgini.autofe.operand import Operand, PandasOperand
10
+ from upgini.autofe.operator import Operand, PandasOperand
11
11
 
12
12
 
13
13
  class Column:
upgini/autofe/groupby.py CHANGED
@@ -2,7 +2,7 @@ from typing import Optional
2
2
 
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
5
+ from upgini.autofe.operator import PandasOperand, ParametrizedOperand, VectorizableMixin
6
6
 
7
7
 
8
8
  class GroupByThenAgg(
@@ -6,7 +6,7 @@ import pandas as pd
6
6
  from pydantic import BaseModel
7
7
 
8
8
 
9
- class OperandRegistry(type(BaseModel)):
9
+ class OperatorRegistry(type(BaseModel)):
10
10
  _registry = {}
11
11
  _parametrized_registry = []
12
12
 
@@ -46,7 +46,7 @@ class OperandRegistry(type(BaseModel)):
46
46
  return None
47
47
 
48
48
 
49
- class Operand(BaseModel, metaclass=OperandRegistry):
49
+ class Operand(BaseModel, metaclass=OperatorRegistry):
50
50
  name: Optional[str] = None
51
51
  alias: Optional[str] = None
52
52
  is_unary: bool = False
@@ -0,0 +1,200 @@
1
+ import abc
2
+ from typing import Dict, List, Optional
3
+
4
+ import pandas as pd
5
+ from upgini.autofe.operator import PandasOperand, ParametrizedOperand
6
+
7
+ try:
8
+ from pydantic import field_validator as validator # V2
9
+ except ImportError:
10
+ from pydantic import validator # V1
11
+
12
+
13
class TimeSeriesBase(PandasOperand, abc.ABC):
    """Base class for vector operands that aggregate a value column over time.

    Subclasses supply ``_aggregate``; this class prepares the date-indexed
    frame, applies the optional offset and aligns the result back to the
    index of the input date column.
    """

    is_vector: bool = True
    # Passed as ``unit`` to ``pd.to_datetime`` when parsing the date column.
    date_unit: Optional[str] = None
    # The value column is shifted by ``offset_size`` ``offset_unit``s when offset_size > 0.
    offset_size: int = 0
    offset_unit: str = "D"

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the date/offset settings."""
        params = super().get_params()
        params.update(
            date_unit=self.date_unit,
            offset_size=self.offset_size,
            offset_unit=self.offset_unit,
        )
        return params

    def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
        """Compute the aggregated series for the given columns.

        Args:
            data: Columns of the input; assumed to be ordered as
                first = date, last = value, everything in between = group keys.

        Returns:
            The last column of the aggregated frame, re-indexed with the index
            of the input date column.
        """
        date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
        value_name = data[-1].name
        group_columns = data[1:-1]

        ts = pd.concat([date] + data[1:], axis=1)
        # Keep one row per (date, group keys) combination.
        ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
        ts.set_index(date.name, inplace=True)
        # Rows whose date could not be parsed were coerced to NaT; drop them.
        ts = ts[ts.index.notna()].sort_index()

        if len(data) > 2:
            group_names = [c.name for c in group_columns]
            shifted = ts.groupby(group_names, group_keys=True).apply(self._shift)[value_name]
            # Rebuild a date-indexed frame and group it again so that
            # ``_aggregate`` operates per group.
            ts = shifted.to_frame().reset_index().set_index(date.name).groupby(group_names)
            reindex_target = group_columns + [date]
        else:
            ts = self._shift(ts)
            reindex_target = date

        ts = self._aggregate(ts)
        ts = ts.reindex(reindex_target).reset_index()
        ts.index = date.index

        return ts.iloc[:, -1]

    def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Shift the last column forward in time by the configured offset.

        With a zero offset the frame is returned unchanged. Otherwise the last
        column's index is shifted by ``offset_size`` ``offset_unit``s and
        merged back onto the remaining columns by index.
        """
        if not self.offset_size > 0:
            return ts

        other_columns = ts.iloc[:, :-1]
        shifted_value = ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}")
        return other_columns.merge(
            shifted_value,
            left_index=True,
            right_index=True,
        )

    @abc.abstractmethod
    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Aggregate the prepared (optionally grouped) date-indexed frame."""
        pass
65
+
66
+
67
+ _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean(), "last": lambda x: x[-1]}
68
+
69
+
70
class Roll(TimeSeriesBase, ParametrizedOperand):
    """Rolling-window aggregation over a time-based window.

    Formula syntax: ``roll_<size><unit>[_offset_<size><unit>]_<aggregation>``,
    where each unit is a single letter.
    """

    aggregation: str
    window_size: int = 1
    window_unit: str = "D"

    @validator("window_unit")
    @classmethod
    def validate_window_unit(cls, v: str) -> str:
        """Reject ``window_unit`` values pandas cannot parse as a frequency.

        Raises:
            ValueError: if ``pd.tseries.frequencies.to_offset`` rejects ``v``.
        """
        try:
            pd.tseries.frequencies.to_offset(v)
        except ValueError as err:
            # Chain the pandas error so the root cause stays in the traceback.
            raise ValueError(
                f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
            ) from err
        return v

    def to_formula(self) -> str:
        """Serialize this operand to its formula string."""
        roll_component = f"roll_{self.window_size}{self.window_unit}"
        if self.offset_size > 0:
            roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
        return f"{roll_component}_{self.aggregation}"

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Roll"]:
        """Parse a formula string; return ``None`` when it is not a roll formula."""
        import re

        # The offset part is optional. The regex engine tries it first, so a
        # formula containing "_offset_<n><u>_" is parsed with an offset, and
        # anything else falls back to the plain form.
        match = re.match(r"^roll_(\d+)([a-zA-Z])(?:_offset_(\d+)([a-zA-Z]))?_(\w+)$", formula)
        if not match:
            return None

        window_size, window_unit, offset_size, offset_unit, aggregation = match.groups()
        params = {
            "window_size": int(window_size),
            "window_unit": window_unit,
            "aggregation": aggregation,
        }
        # Only pass offset fields when present so the defaults apply otherwise.
        if offset_size is not None:
            params["offset_size"] = int(offset_size)
            params["offset_unit"] = offset_unit

        return cls(**params)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the window settings."""
        res = super().get_params()
        res.update(
            {
                "window_size": self.window_size,
                "window_unit": self.window_unit,
                "aggregation": self.aggregation,
            }
        )
        return res

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Apply the configured aggregation over a rolling time window.

        Names found in ``_roll_aggregations`` use the custom callable; any
        other name is handed to pandas unchanged.
        """
        return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
            _roll_aggregations.get(self.aggregation, self.aggregation)
        )
143
+
144
+
145
class Lag(TimeSeriesBase, ParametrizedOperand):
    """Lagged value of a time series.

    Formula syntax: ``lag_<size><unit>[_offset_<size><unit>]``, where each
    unit is a single letter.
    """

    lag_size: int
    lag_unit: str = "D"

    def to_formula(self) -> str:
        """Serialize this operand to its formula string."""
        lag_component = f"lag_{self.lag_size}{self.lag_unit}"
        if self.offset_size > 0:
            lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
        return lag_component

    @classmethod
    def from_formula(cls, formula: str) -> Optional["Lag"]:
        """Parse a formula string; return ``None`` when it is not a lag formula."""
        import re

        # One pattern with an optional offset suffix covers both formula forms.
        match = re.match(r"^lag_(\d+)([a-zA-Z])(?:_offset_(\d+)([a-zA-Z]))?$", formula)
        if not match:
            return None

        lag_size, lag_unit, offset_size, offset_unit = match.groups()
        params = {"lag_size": int(lag_size), "lag_unit": lag_unit}
        # Only pass offset fields when present so the defaults apply otherwise.
        if offset_size is not None:
            params["offset_size"] = int(offset_size)
            params["offset_unit"] = offset_unit

        return cls(**params)

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the lag settings."""
        res = super().get_params()
        res.update(
            {
                "lag_size": self.lag_size,
                "lag_unit": self.lag_unit,
            }
        )
        return res

    def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
        """Take the first value of a window spanning ``lag_size + 1`` units.

        Windows with fewer than ``lag_size + 1`` observations yield NaN
        (``min_periods``).
        """
        lag_window = self.lag_size + 1
        # Positional access via ``.iloc``: the window is indexed by date, and
        # integer ``x[0]`` on a non-integer index is deprecated in pandas.
        # NOTE(review): presumably the first value is the one ``lag_size``
        # units back for regularly spaced data — confirm for irregular series.
        return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x.iloc[0])
upgini/autofe/unary.py CHANGED
@@ -2,7 +2,7 @@ from typing import Dict, Optional
2
2
  import numpy as np
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
5
+ from upgini.autofe.operator import PandasOperand, VectorizableMixin
6
6
 
7
7
 
8
8
  class Abs(PandasOperand, VectorizableMixin):
upgini/autofe/vector.py CHANGED
@@ -1,14 +1,8 @@
1
- import abc
2
- from typing import Dict, List, Optional
1
+ from typing import List, Optional
3
2
 
4
3
  import pandas as pd
5
4
 
6
- try:
7
- from pydantic import field_validator as validator # V2
8
- except ImportError:
9
- from pydantic import validator # V1
10
-
11
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
5
+ from upgini.autofe.operator import PandasOperand, VectorizableMixin
12
6
 
13
7
 
14
8
  class Mean(PandasOperand, VectorizableMixin):
@@ -28,193 +22,3 @@ class Sum(PandasOperand, VectorizableMixin):
28
22
 
29
23
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
30
24
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
31
-
32
-
33
- class TimeSeriesBase(PandasOperand, abc.ABC):
34
- is_vector: bool = True
35
- date_unit: Optional[str] = None
36
- offset_size: int = 0
37
- offset_unit: str = "D"
38
-
39
- def get_params(self) -> Dict[str, Optional[str]]:
40
- res = super().get_params()
41
- res.update(
42
- {
43
- "date_unit": self.date_unit,
44
- "offset_size": self.offset_size,
45
- "offset_unit": self.offset_unit,
46
- }
47
- )
48
- return res
49
-
50
- def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
51
- # assuming first is date, last is value, rest is group columns
52
- date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
53
- ts = pd.concat([date] + data[1:], axis=1)
54
- ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
55
- ts.set_index(date.name, inplace=True)
56
- ts = ts[ts.index.notna()].sort_index()
57
- ts = (
58
- ts.groupby([c.name for c in data[1:-1]])
59
- .apply(self._shift)[data[-1].name]
60
- .to_frame()
61
- .reset_index()
62
- .set_index(date.name)
63
- .groupby([c.name for c in data[1:-1]])
64
- if len(data) > 2
65
- else self._shift(ts)
66
- )
67
- ts = self._aggregate(ts)
68
- ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
69
- ts.index = date.index
70
-
71
- return ts.iloc[:, -1]
72
-
73
- def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
74
- if self.offset_size > 0:
75
- return ts.iloc[:, :-1].merge(
76
- ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
77
- left_index=True,
78
- right_index=True,
79
- )
80
- return ts
81
-
82
- @abc.abstractmethod
83
- def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
84
- pass
85
-
86
-
87
- _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
88
-
89
-
90
- class Roll(TimeSeriesBase, ParametrizedOperand):
91
- aggregation: str
92
- window_size: int = 1
93
- window_unit: str = "D"
94
-
95
- @validator("window_unit")
96
- @classmethod
97
- def validate_window_unit(cls, v: str) -> str:
98
- try:
99
- pd.tseries.frequencies.to_offset(v)
100
- return v
101
- except ValueError:
102
- raise ValueError(
103
- f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
104
- )
105
-
106
- def to_formula(self) -> str:
107
- roll_component = f"roll_{self.window_size}{self.window_unit}"
108
- if self.offset_size > 0:
109
- roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
110
- return f"{roll_component}_{self.aggregation}"
111
-
112
- @classmethod
113
- def from_formula(cls, formula: str) -> Optional["Roll"]:
114
- import re
115
-
116
- # Try matching pattern with offset first
117
- pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
118
- match_with_offset = re.match(pattern_with_offset, formula)
119
-
120
- if match_with_offset:
121
- window_size = int(match_with_offset.group(1))
122
- window_unit = match_with_offset.group(2)
123
- offset_size = int(match_with_offset.group(3))
124
- offset_unit = match_with_offset.group(4)
125
- aggregation = match_with_offset.group(5)
126
-
127
- return cls(
128
- window_size=window_size,
129
- window_unit=window_unit,
130
- offset_size=offset_size,
131
- offset_unit=offset_unit,
132
- aggregation=aggregation,
133
- )
134
-
135
- # If no offset pattern found, try basic pattern
136
- pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
137
- match = re.match(pattern, formula)
138
-
139
- if not match:
140
- return None
141
-
142
- window_size = int(match.group(1))
143
- window_unit = match.group(2)
144
- aggregation = match.group(3)
145
-
146
- return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
147
-
148
- def get_params(self) -> Dict[str, Optional[str]]:
149
- res = super().get_params()
150
- res.update(
151
- {
152
- "window_size": self.window_size,
153
- "window_unit": self.window_unit,
154
- "aggregation": self.aggregation,
155
- }
156
- )
157
- return res
158
-
159
- def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
160
- return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
161
- _roll_aggregations.get(self.aggregation, self.aggregation)
162
- )
163
-
164
-
165
- class Lag(TimeSeriesBase, ParametrizedOperand):
166
- lag_size: int
167
- lag_unit: str = "D"
168
-
169
- def to_formula(self) -> str:
170
- lag_component = f"lag_{self.lag_size}{self.lag_unit}"
171
- if self.offset_size > 0:
172
- lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
173
- return lag_component
174
-
175
- @classmethod
176
- def from_formula(cls, formula: str) -> Optional["Lag"]:
177
- import re
178
-
179
- # Try matching pattern with offset first
180
- pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
181
- match_with_offset = re.match(pattern_with_offset, formula)
182
-
183
- if match_with_offset:
184
- lag_size = int(match_with_offset.group(1))
185
- lag_unit = match_with_offset.group(2)
186
- offset_size = int(match_with_offset.group(3))
187
- offset_unit = match_with_offset.group(4)
188
-
189
- return cls(
190
- lag_size=lag_size,
191
- lag_unit=lag_unit,
192
- offset_size=offset_size,
193
- offset_unit=offset_unit,
194
- )
195
-
196
- # If no offset pattern found, try basic pattern
197
- pattern = r"^lag_(\d+)([a-zA-Z])$"
198
- match = re.match(pattern, formula)
199
-
200
- if not match:
201
- return None
202
-
203
- lag_size = int(match.group(1))
204
- lag_unit = match.group(2)
205
-
206
- return cls(lag_size=lag_size, lag_unit=lag_unit)
207
-
208
- def get_params(self) -> Dict[str, Optional[str]]:
209
- res = super().get_params()
210
- res.update(
211
- {
212
- "lag_size": self.lag_size,
213
- "lag_unit": self.lag_unit,
214
- }
215
- )
216
- return res
217
-
218
- def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
219
- lag_window = self.lag_size + 1
220
- return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
upgini/dataset.py CHANGED
@@ -40,7 +40,7 @@ from upgini.utils.email_utils import EmailSearchKeyConverter
40
40
  from upgini.utils.target_utils import (
41
41
  balance_undersample,
42
42
  balance_undersample_forced,
43
- balance_undersample_time_series,
43
+ balance_undersample_time_series_trunc,
44
44
  )
45
45
 
46
46
  try:
@@ -58,6 +58,8 @@ class Dataset: # (pd.DataFrame):
58
58
  FIT_SAMPLE_THRESHOLD = 200_000
59
59
  FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
60
60
  FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
61
+ FIT_SAMPLE_THRESHOLD_TS = 54_000
62
+ FIT_SAMPLE_ROWS_TS = 54_000
61
63
  BINARY_MIN_SAMPLE_THRESHOLD = 5_000
62
64
  MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
63
65
  IMBALANCE_THESHOLD = 0.6
@@ -301,7 +303,10 @@ class Dataset: # (pd.DataFrame):
301
303
  )
302
304
 
303
305
  # Resample over fit threshold
304
- if not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
306
+ if self.cv_type is not None and self.cv_type.is_time_series():
307
+ sample_threshold = self.FIT_SAMPLE_THRESHOLD_TS
308
+ sample_rows = self.FIT_SAMPLE_ROWS_TS
309
+ elif not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
305
310
  sample_threshold = self.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
306
311
  sample_rows = self.FIT_SAMPLE_WITH_EVAL_SET_ROWS
307
312
  else:
@@ -314,7 +319,7 @@ class Dataset: # (pd.DataFrame):
314
319
  f"and will be downsampled to {sample_rows}"
315
320
  )
316
321
  if self.cv_type is not None and self.cv_type.is_time_series():
317
- resampled_data = balance_undersample_time_series(
322
+ resampled_data = balance_undersample_time_series_trunc(
318
323
  df=self.data,
319
324
  id_columns=self.id_columns,
320
325
  date_column=next(
@@ -584,10 +589,7 @@ class Dataset: # (pd.DataFrame):
584
589
  return search_customization
585
590
 
586
591
  def _rename_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
587
- if (
588
- runtime_parameters is not None
589
- and runtime_parameters.properties is not None
590
- ):
592
+ if runtime_parameters is not None and runtime_parameters.properties is not None:
591
593
  if "generate_features" in runtime_parameters.properties:
592
594
  generate_features = runtime_parameters.properties["generate_features"].split(",")
593
595
  renamed_generate_features = []
@@ -607,6 +609,13 @@ class Dataset: # (pd.DataFrame):
607
609
 
608
610
  return runtime_parameters
609
611
 
612
def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
    """Set the time-series sample sizes in the runtime properties.

    When the dataset uses a time-series cross-validation type, both
    ``sample_size`` and ``iter0_sample_size`` are set to
    ``FIT_SAMPLE_ROWS_TS``. Nothing is changed when ``runtime_parameters``
    or its ``properties`` is ``None``.

    Returns:
        The same ``runtime_parameters`` object that was passed in.
    """
    if runtime_parameters is None or runtime_parameters.properties is None:
        return runtime_parameters

    is_time_series = self.cv_type is not None and self.cv_type.is_time_series()
    if is_time_series:
        for key in ("sample_size", "iter0_sample_size"):
            runtime_parameters.properties[key] = self.FIT_SAMPLE_ROWS_TS

    return runtime_parameters
618
+
610
619
  def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
611
620
  if (
612
621
  runtime_parameters is not None
@@ -638,6 +647,7 @@ class Dataset: # (pd.DataFrame):
638
647
  file_metrics = FileMetrics()
639
648
 
640
649
  runtime_parameters = self._rename_generate_features(runtime_parameters)
650
+ runtime_parameters = self._set_sample_size(runtime_parameters)
641
651
 
642
652
  file_metadata = self.__construct_metadata(exclude_features_sources)
643
653
  search_customization = self.__construct_search_customization(
@@ -31,7 +31,7 @@ from sklearn.exceptions import NotFittedError
31
31
  from sklearn.model_selection import BaseCrossValidator
32
32
 
33
33
  from upgini.autofe.feature import Feature
34
- from upgini.autofe.vector import TimeSeriesBase
34
+ from upgini.autofe.timeseries import TimeSeriesBase
35
35
  from upgini.data_source.data_source_publisher import CommercialSchema
36
36
  from upgini.dataset import Dataset
37
37
  from upgini.errors import HttpError, ValidationError
@@ -9,6 +9,7 @@ from upgini.errors import ValidationError
9
9
  from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
10
10
  from upgini.resource_bundle import ResourceBundle, bundle, get_custom_bundle
11
11
  from upgini.sampler.random_under_sampler import RandomUnderSampler
12
+ from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
12
13
 
13
14
  TS_MIN_DIFFERENT_IDS_RATIO = 0.2
14
15
 
@@ -240,7 +241,7 @@ def balance_undersample_forced(
240
241
  df = df.copy().sort_values(by=SYSTEM_RECORD_ID)
241
242
  if cv_type is not None and cv_type.is_time_series():
242
243
  logger.warning(f"Sampling time series dataset from {len(df)} to {sample_size}")
243
- resampled_data = balance_undersample_time_series(
244
+ resampled_data = balance_undersample_time_series_trunc(
244
245
  df,
245
246
  id_columns=id_columns,
246
247
  date_column=date_column,
@@ -279,6 +280,58 @@ def balance_undersample_forced(
279
280
  return resampled_data
280
281
 
281
282
 
283
# History lengths tried in order when truncating a time series. The first list
# is used when the detected time unit is shorter than
# DEFAULT_TIME_UNIT_THRESHOLD, the second otherwise.
DEFAULT_HIGH_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=2, months=6), pd.DateOffset(years=2, days=7)]
DEFAULT_LOW_FREQ_TRUNC_LENGTHS = [pd.DateOffset(years=7), pd.DateOffset(years=5)]
DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)


def balance_undersample_time_series_trunc(
    df: pd.DataFrame,
    id_columns: List[str],
    date_column: str,
    sample_size: int,
    random_state: int = 42,
    logger: Optional[logging.Logger] = None,
    highfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_HIGH_FREQ_TRUNC_LENGTHS,
    lowfreq_trunc_lengths: List[pd.DateOffset] = DEFAULT_LOW_FREQ_TRUNC_LENGTHS,
    time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
    **kwargs,
):
    """Downsample a time-series dataset by truncating history before undersampling.

    The most frequent time unit of the series selects which list of
    truncation lengths to use. Lengths are tried in order until the truncated
    data has at most ``sample_size`` rows. If no length achieves that, the
    result of the last attempt is passed to ``balance_undersample_time_series``.

    Args:
        df: Source data; ``date_column`` is interpreted as epoch milliseconds.
        id_columns: Columns identifying individual series (may be empty).
        date_column: Name of the timestamp column.
        sample_size: Target maximum number of rows.
        random_state: Forwarded to ``balance_undersample_time_series``.
        logger: Optional logger for progress messages.
        highfreq_trunc_lengths: Lengths used when the time unit is below the threshold.
        lowfreq_trunc_lengths: Lengths used otherwise.
        time_unit_threshold: Boundary between "high" and "low" frequency.
        **kwargs: Forwarded to ``balance_undersample_time_series``.

    Returns:
        The rows of ``df`` selected by the sampling, or ``df`` itself when the
        time unit cannot be detected.
    """
    # Work on a narrow copy so the caller's frame is not modified.
    dates_df = df[id_columns + [date_column]].copy()
    dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")

    time_unit = get_most_frequent_time_unit(dates_df, id_columns, date_column)
    if logger is not None:
        logger.info(f"Time unit: {time_unit}")

    if time_unit is None:
        if logger is not None:
            logger.info("Cannot detect time unit, returning original dataset")
        return df

    trunc_lengths = highfreq_trunc_lengths if time_unit < time_unit_threshold else lowfreq_trunc_lengths

    # Start from the untruncated data so an empty list of lengths falls
    # through to plain undersampling instead of leaving sampled_df unbound.
    sampled_df = dates_df
    for trunc_length in trunc_lengths:
        sampled_df = trunc_datetime(dates_df, id_columns, date_column, trunc_length, logger=logger)
        if len(sampled_df) <= sample_size:
            break

    if len(sampled_df) > sample_size:
        sampled_df = balance_undersample_time_series(
            sampled_df, id_columns, date_column, sample_size, random_state, logger=logger, **kwargs
        )

    return df.loc[sampled_df.index]
333
+
334
+
282
335
  def balance_undersample_time_series(
283
336
  df: pd.DataFrame,
284
337
  id_columns: List[str],
@@ -0,0 +1,41 @@
1
+ import logging
2
+ from typing import List, Optional
3
+ import pandas as pd
4
+
5
+
6
def get_most_frequent_time_unit(df: pd.DataFrame, id_columns: List[str], date_column: str) -> Optional[pd.DateOffset]:
    """Return the most common gap between consecutive distinct dates as an offset.

    Gaps are computed separately inside each ``id_columns`` group (or over
    the whole frame when ``id_columns`` is empty) and then pooled. When
    several gaps are equally frequent, the smallest one is returned.

    Args:
        df: Frame containing ``date_column`` with datetime values.
        id_columns: Columns identifying individual series; may be empty.
        date_column: Name of the datetime column.

    Returns:
        The most frequent gap converted with ``pandas.tseries.frequencies.to_offset``,
        or ``None`` when no group has at least two distinct dates.
    """
    groups = df.groupby(id_columns) if id_columns else [(None, df)]

    gaps_in_seconds = []
    for _, group in groups:
        # Missing dates carry no spacing information and cannot be converted
        # to an offset, so they are dropped before differencing.
        group_dates = pd.Series(group[date_column].dropna().unique()).sort_values()
        if len(group_dates) > 1:
            # The first diff is always NaT (no predecessor); skip it.
            gaps_in_seconds.append(group_dates.diff().iloc[1:].dt.total_seconds())

    if not gaps_in_seconds:
        return None

    # Take the mode on plain numbers and convert only the winner, instead of
    # building an offset object for every single gap.
    most_frequent_gap = pd.concat(gaps_in_seconds, ignore_index=True).mode().min()
    return pd.tseries.frequencies.to_offset(pd.Timedelta(most_frequent_gap, unit="s"))
25
+
26
+
27
def trunc_datetime(
    df: pd.DataFrame,
    id_columns: List[str],
    date_column: str,
    length: pd.DateOffset,
    logger: Optional[logging.Logger] = None,
) -> pd.DataFrame:
    """Keep only the most recent ``length`` of history.

    The cutoff is ``max(date) - length``, computed per ``id_columns`` group
    when ids are given and over the whole frame otherwise. Rows strictly
    later than the cutoff are kept.

    Args:
        df: Frame containing ``date_column`` with datetime values.
        id_columns: Columns identifying individual series; may be empty.
        date_column: Name of the datetime column.
        length: Amount of history to keep, counted back from the latest date.
        logger: Optional logger for a progress message.

    Returns:
        The filtered rows of ``df`` with their original index.
    """
    if logger is not None:
        logger.info(f"Truncating time series dataset to {length}")

    dates = df[date_column]
    if id_columns:
        # The built-in "max" reduction avoids calling a Python lambda once
        # per group; the offset is then subtracted in one vectorized step.
        latest = df.groupby(id_columns)[date_column].transform("max")
    else:
        latest = dates.max()

    return df[dates > latest - length]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.61
3
+ Version: 1.2.62a3818.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=17s3XgKQ6UgMiFGNXwnQprj1EsjPUiE6QGnAzyDIfhs,23
1
+ upgini/__about__.py,sha256=-inFSOjK0otU7oAU9xIxafvjGaGWyHQqEAz5nWw5yqI,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=NP5vHqEfZQ1HWz3TcNAa_OhXG8wiMRdydm26D6UBiRU,34166
4
+ upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=2AMEXtoMrEFw3f0b0CsvkFyS1a7L4aqI2GO_fCsgWac,205336
6
+ upgini/features_enricher.py,sha256=cB2I5rNpbztjkYEEW5aJuKj2fCMnfxp40X4Eo63oyuQ,205340
7
7
  upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
@@ -14,14 +14,15 @@ upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
18
- upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
19
- upgini/autofe/date.py,sha256=pqwwk4_35RYXDT2fSJ9dlxGBm-R0jWBeiSb-79hZjkI,10721
20
- upgini/autofe/feature.py,sha256=zvRdlxCkaOsX0XiragNvh0tAPyOWut0MQTq5JGU5HtY,14749
21
- upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
22
- upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
23
- upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
24
- upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
17
+ upgini/autofe/all_operands.py,sha256=VIT5jCq5U-qypdNz1MIQ_hlIAs0ujJgRfKRUkU24nFs,332
18
+ upgini/autofe/binary.py,sha256=jsXa_zwlNWRmQAT5qipzU2Or03qae-a1kkY9yDECkq8,7660
19
+ upgini/autofe/date.py,sha256=bmoXU5vlDa1xsfCIFEC_VMRHOnV8Sy_KUMshqh0ARvA,10722
20
+ upgini/autofe/feature.py,sha256=n4sNNFM9b022AGJbW14AMRuERD9bwub-RWqa6hfLID0,14750
21
+ upgini/autofe/groupby.py,sha256=NN0T-tYbTHQDeCi2UZ06wVkDflm8DJBV4rdGrrVyVEE,3596
22
+ upgini/autofe/operator.py,sha256=VCGDUQ5bOtwX-jzmgHDrKF3GbglDumyEkvtLWTmSGQo,4776
23
+ upgini/autofe/timeseries.py,sha256=Pci7kNpFcViNZdIHlVTyxjoxzcMVdqUPopbPrJ3hE20,6593
24
+ upgini/autofe/unary.py,sha256=my7AYIrWCQPFxRtcphONmwieU5HpX4fHiKllFRCsMUk,4647
25
+ upgini/autofe/vector.py,sha256=5Lx2q_Np9PrMtZ_8O86xywq0s4XSQbooHxK3ufo3ANU,664
25
26
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
27
  upgini/data_source/data_source_publisher.py,sha256=4S9qwlAklD8vg9tUU_c1pHE2_glUHAh15-wr5hMwKFw,22879
27
28
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
@@ -58,10 +59,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
58
59
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
59
60
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
60
61
  upgini/utils/sort.py,sha256=w-CoT33W_53ekOROpKI_VRsRmiyWNr2b3IpE5_4MLLA,6395
61
- upgini/utils/target_utils.py,sha256=VsMdlS04_9SHlB2DPfSWTeqjc2JoXR5OPvu4qmvkmkg,14347
62
+ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
62
63
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
64
+ upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
63
65
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
64
- upgini-1.2.61.dist-info/METADATA,sha256=hH2eL4JHq8BjVpY3ZNFYDqUtKs5psdoiVM5jiXjs0yU,49084
65
- upgini-1.2.61.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
66
- upgini-1.2.61.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
67
- upgini-1.2.61.dist-info/RECORD,,
66
+ upgini-1.2.62a3818.dev1.dist-info/METADATA,sha256=9mRM2yQ18CeOTHQ83UgVmItZ-npsZSla3illeXSpyTQ,49094
67
+ upgini-1.2.62a3818.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
68
+ upgini-1.2.62a3818.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
69
+ upgini-1.2.62a3818.dev1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any