upgini 1.2.63__py3-none-any.whl → 1.2.65__py3-none-any.whl

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
upgini/autofe/vector.py CHANGED
@@ -1,17 +1,11 @@
1
- import abc
2
- from typing import Dict, List, Optional
1
+ from typing import List, Optional
3
2
 
4
3
  import pandas as pd
5
4
 
6
- try:
7
- from pydantic import field_validator as validator # V2
8
- except ImportError:
9
- from pydantic import validator # V1
5
+ from upgini.autofe.operator import PandasOperator, VectorizableMixin
10
6
 
11
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
12
7
 
13
-
14
- class Mean(PandasOperand, VectorizableMixin):
8
+ class Mean(PandasOperator, VectorizableMixin):
15
9
  name: str = "mean"
16
10
  output_type: Optional[str] = "float"
17
11
  is_vector: bool = True
@@ -21,200 +15,10 @@ class Mean(PandasOperand, VectorizableMixin):
21
15
  return pd.DataFrame(data).T.fillna(0).mean(axis=1)
22
16
 
23
17
 
24
- class Sum(PandasOperand, VectorizableMixin):
18
+ class Sum(PandasOperator, VectorizableMixin):
25
19
  name: str = "sum"
26
20
  is_vector: bool = True
27
21
  group_index: int = 0
28
22
 
29
23
  def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
30
24
  return pd.DataFrame(data).T.fillna(0).sum(axis=1)
31
-
32
-
33
- class TimeSeriesBase(PandasOperand, abc.ABC):
34
- is_vector: bool = True
35
- date_unit: Optional[str] = None
36
- offset_size: int = 0
37
- offset_unit: str = "D"
38
-
39
- def get_params(self) -> Dict[str, Optional[str]]:
40
- res = super().get_params()
41
- res.update(
42
- {
43
- "date_unit": self.date_unit,
44
- "offset_size": self.offset_size,
45
- "offset_unit": self.offset_unit,
46
- }
47
- )
48
- return res
49
-
50
- def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
51
- # assuming first is date, last is value, rest is group columns
52
- date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
53
- ts = pd.concat([date] + data[1:], axis=1)
54
- ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
55
- ts.set_index(date.name, inplace=True)
56
- ts = ts[ts.index.notna()].sort_index()
57
- ts = (
58
- ts.groupby([c.name for c in data[1:-1]])
59
- .apply(self._shift)[data[-1].name]
60
- .to_frame()
61
- .reset_index()
62
- .set_index(date.name)
63
- .groupby([c.name for c in data[1:-1]])
64
- if len(data) > 2
65
- else self._shift(ts)
66
- )
67
- ts = self._aggregate(ts)
68
- ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
69
- ts.index = date.index
70
-
71
- return ts.iloc[:, -1]
72
-
73
- def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
74
- if self.offset_size > 0:
75
- return ts.iloc[:, :-1].merge(
76
- ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
77
- left_index=True,
78
- right_index=True,
79
- )
80
- return ts
81
-
82
- @abc.abstractmethod
83
- def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
84
- pass
85
-
86
-
87
- _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
88
-
89
-
90
- class Roll(TimeSeriesBase, ParametrizedOperand):
91
- aggregation: str
92
- window_size: int = 1
93
- window_unit: str = "D"
94
-
95
- @validator("window_unit")
96
- @classmethod
97
- def validate_window_unit(cls, v: str) -> str:
98
- try:
99
- pd.tseries.frequencies.to_offset(v)
100
- return v
101
- except ValueError:
102
- raise ValueError(
103
- f"Invalid window_unit: {v}. Must be a valid pandas frequency string (e.g. 'D', 'H', 'T', etc)"
104
- )
105
-
106
- def to_formula(self) -> str:
107
- roll_component = f"roll_{self.window_size}{self.window_unit}"
108
- if self.offset_size > 0:
109
- roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
110
- return f"{roll_component}_{self.aggregation}"
111
-
112
- @classmethod
113
- def from_formula(cls, formula: str) -> Optional["Roll"]:
114
- import re
115
-
116
- # Try matching pattern with offset first
117
- pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
118
- match_with_offset = re.match(pattern_with_offset, formula)
119
-
120
- if match_with_offset:
121
- window_size = int(match_with_offset.group(1))
122
- window_unit = match_with_offset.group(2)
123
- offset_size = int(match_with_offset.group(3))
124
- offset_unit = match_with_offset.group(4)
125
- aggregation = match_with_offset.group(5)
126
-
127
- return cls(
128
- window_size=window_size,
129
- window_unit=window_unit,
130
- offset_size=offset_size,
131
- offset_unit=offset_unit,
132
- aggregation=aggregation,
133
- )
134
-
135
- # If no offset pattern found, try basic pattern
136
- pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
137
- match = re.match(pattern, formula)
138
-
139
- if not match:
140
- return None
141
-
142
- window_size = int(match.group(1))
143
- window_unit = match.group(2)
144
- aggregation = match.group(3)
145
-
146
- return cls(window_size=window_size, window_unit=window_unit, aggregation=aggregation)
147
-
148
- def get_params(self) -> Dict[str, Optional[str]]:
149
- res = super().get_params()
150
- res.update(
151
- {
152
- "window_size": self.window_size,
153
- "window_unit": self.window_unit,
154
- "aggregation": self.aggregation,
155
- }
156
- )
157
- return res
158
-
159
- def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
160
- return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
161
- _roll_aggregations.get(self.aggregation, self.aggregation)
162
- )
163
-
164
-
165
- class Lag(TimeSeriesBase, ParametrizedOperand):
166
- lag_size: int
167
- lag_unit: str = "D"
168
-
169
- def to_formula(self) -> str:
170
- lag_component = f"lag_{self.lag_size}{self.lag_unit}"
171
- if self.offset_size > 0:
172
- lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
173
- return lag_component
174
-
175
- @classmethod
176
- def from_formula(cls, formula: str) -> Optional["Lag"]:
177
- import re
178
-
179
- # Try matching pattern with offset first
180
- pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
181
- match_with_offset = re.match(pattern_with_offset, formula)
182
-
183
- if match_with_offset:
184
- lag_size = int(match_with_offset.group(1))
185
- lag_unit = match_with_offset.group(2)
186
- offset_size = int(match_with_offset.group(3))
187
- offset_unit = match_with_offset.group(4)
188
-
189
- return cls(
190
- lag_size=lag_size,
191
- lag_unit=lag_unit,
192
- offset_size=offset_size,
193
- offset_unit=offset_unit,
194
- )
195
-
196
- # If no offset pattern found, try basic pattern
197
- pattern = r"^lag_(\d+)([a-zA-Z])$"
198
- match = re.match(pattern, formula)
199
-
200
- if not match:
201
- return None
202
-
203
- lag_size = int(match.group(1))
204
- lag_unit = match.group(2)
205
-
206
- return cls(lag_size=lag_size, lag_unit=lag_unit)
207
-
208
- def get_params(self) -> Dict[str, Optional[str]]:
209
- res = super().get_params()
210
- res.update(
211
- {
212
- "lag_size": self.lag_size,
213
- "lag_unit": self.lag_unit,
214
- }
215
- )
216
- return res
217
-
218
- def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
219
- lag_window = self.lag_size + 1
220
- return ts.rolling(f"{lag_window}{self.lag_unit}", min_periods=lag_window).agg(lambda x: x[0])
@@ -31,7 +31,7 @@ from sklearn.exceptions import NotFittedError
31
31
  from sklearn.model_selection import BaseCrossValidator
32
32
 
33
33
  from upgini.autofe.feature import Feature
34
- from upgini.autofe.vector import TimeSeriesBase
34
+ from upgini.autofe.timeseries import TimeSeriesBase
35
35
  from upgini.data_source.data_source_publisher import CommercialSchema
36
36
  from upgini.dataset import Dataset
37
37
  from upgini.errors import HttpError, ValidationError
@@ -3632,7 +3632,7 @@ if response.status_code == 200:
3632
3632
  )
3633
3633
  do_sorting = False
3634
3634
  else:
3635
- columns_to_hash = list(search_keys.keys()) + renamed_id_columns + [target_name]
3635
+ columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
3636
3636
  columns_to_hash = sort_columns(
3637
3637
  df[columns_to_hash],
3638
3638
  target_name,
upgini/utils/sort.py CHANGED
@@ -28,12 +28,13 @@ def sort_columns(
28
28
  logger = logging.getLogger(__name__)
29
29
  logger.setLevel(logging.FATAL)
30
30
  df = df.copy() # avoid side effects
31
+ search_keys = {k: v for k, v in search_keys.items() if v != SearchKey.CUSTOM_KEY}
31
32
 
32
33
  # Check multiple search keys
33
34
  search_key_values = list(search_keys.values())
34
35
  has_duplicate_search_keys = len(search_key_values) != len(set(search_key_values))
35
36
  if has_duplicate_search_keys:
36
- logging.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")
37
+ logger.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")
37
38
 
38
39
  sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
39
40
  sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]
@@ -68,8 +69,9 @@ def get_sort_columns_dict(
68
69
  if len(string_features) > 0:
69
70
  if len(df) > len(df.drop(columns=string_features).drop_duplicates()) or sort_all_columns:
70
71
  # factorize string features
72
+ df = df.copy()
71
73
  for c in string_features:
72
- df.loc[:, c] = pd.Series(df[c].factorize(sort=True)[0], index=df.index, dtype="int")
74
+ df = df.assign(**{c: pd.factorize(df[c], sort=True)[0].astype(int)})
73
75
  columns_for_sort.extend(string_features)
74
76
 
75
77
  if len(columns_for_sort) == 0:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.63
3
+ Version: 1.2.65
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=jIcsQGjL4QjnLFsRkdEHc7S78dfQHi-auHwc_P5Xftc,23
1
+ upgini/__about__.py,sha256=9LdiugHjYADPBPHXjA5mj8ce_XDBj0fp-oIlGtPl5HI,23
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=2AMEXtoMrEFw3f0b0CsvkFyS1a7L4aqI2GO_fCsgWac,205336
6
+ upgini/features_enricher.py,sha256=nXGBMC42VPAmqQKXbEqZJFIHiGj6F_G2AwhurA8LuQs,205351
7
7
  upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
@@ -14,14 +14,22 @@ upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1
14
14
  upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
15
15
  upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
18
- upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
19
- upgini/autofe/date.py,sha256=pqwwk4_35RYXDT2fSJ9dlxGBm-R0jWBeiSb-79hZjkI,10721
20
- upgini/autofe/feature.py,sha256=zvRdlxCkaOsX0XiragNvh0tAPyOWut0MQTq5JGU5HtY,14749
21
- upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
22
- upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
23
- upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
24
- upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
17
+ upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
18
+ upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
19
+ upgini/autofe/date.py,sha256=I07psJerrxOcHao91PdSCk9X6KWu61IBVyFRLjGNgK8,10730
20
+ upgini/autofe/feature.py,sha256=xgu6bVIlUJ5PCUgoXQRNcGkcMOhj-_BdDRmkB_qRFS4,14766
21
+ upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
22
+ upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
23
+ upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
24
+ upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
25
+ upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
26
+ upgini/autofe/timeseries/base.py,sha256=T9Ec8LKJbiwTUGGsd_xhM0U0NUJblqmKchkzUI1sK88,3755
27
+ upgini/autofe/timeseries/cross.py,sha256=Sh5hAXZFWKaFRqf_JGODu9pWO2tmuV5VKyK9eX3i7-I,4931
28
+ upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
29
+ upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
30
+ upgini/autofe/timeseries/roll.py,sha256=bNFMDszSYTWvB7EyhHbRY1DJqzSURvHlPAcBebt0y0Y,2878
31
+ upgini/autofe/timeseries/trend.py,sha256=9p2Q5ByAi6cx9RH9teBTe8FyjSzqthznC2Lo5dsJ0ho,2051
32
+ upgini/autofe/timeseries/volatility.py,sha256=9shUmIKjpWTHVYjj80YBsk0XheBJ9uBuLv5NW9Mchnk,7953
25
33
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
34
  upgini/data_source/data_source_publisher.py,sha256=4S9qwlAklD8vg9tUU_c1pHE2_glUHAh15-wr5hMwKFw,22879
27
35
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
@@ -57,12 +65,12 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
57
65
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
58
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
59
67
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
60
- upgini/utils/sort.py,sha256=w-CoT33W_53ekOROpKI_VRsRmiyWNr2b3IpE5_4MLLA,6395
68
+ upgini/utils/sort.py,sha256=GfWfCIbfK7e7BvSPZZNJD-PEtiN19DnTCEQkeefHHxI,6491
61
69
  upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
62
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
63
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
64
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
65
- upgini-1.2.63.dist-info/METADATA,sha256=nH5TvEpkQ7qCwZi9uFN6qThiBIe3jLgLCIeRtZeflnA,49113
66
- upgini-1.2.63.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
67
- upgini-1.2.63.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
68
- upgini-1.2.63.dist-info/RECORD,,
73
+ upgini-1.2.65.dist-info/METADATA,sha256=GGxmpRnHQUTsCQlWPZeNL2xk27XWuEWrvECLPVEx5vU,49113
74
+ upgini-1.2.65.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
75
+ upgini-1.2.65.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.65.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any