upgini 1.2.57a2__py3-none-any.whl → 1.2.57a3675.dev5__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registries.

Potentially problematic release.


This version of upgini might be problematic; see the registry's advisory page for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.57a2"
1
+ __version__ = "1.2.57a3675.dev5"
upgini/autofe/date.py CHANGED
@@ -64,6 +64,9 @@ class DateDiff(PandasOperand, DateDiffMixin):
64
64
  return res
65
65
 
66
66
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
67
+ if left.isna().all() or right.isna().all():
68
+ return pd.Series([None] * len(left))
69
+
67
70
  left = self._convert_to_date(left, self.left_unit)
68
71
  right = self._convert_to_date(right, self.right_unit)
69
72
  diff = self._convert_diff_to_unit(left.dt.date - right.dt.date)
@@ -142,6 +145,9 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
142
145
  return cls(aggregation=aggregation)
143
146
 
144
147
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
148
+ if left.isna().all() or right.isna().all():
149
+ return pd.Series([None] * len(left), dtype=np.float64)
150
+
145
151
  left = self._convert_to_date(left, self.left_unit)
146
152
  right_mask = right.apply(lambda x: len(x) > 0)
147
153
  mask = left.notna() & right.notna() & right_mask
@@ -230,6 +236,8 @@ class DatePercentileBase(PandasOperand, abc.ABC):
230
236
  pass
231
237
 
232
238
  def _perc(self, f, bounds):
239
+ if f is None or np.isnan(f):
240
+ return np.nan
233
241
  hit = np.where(f >= np.array(bounds))[0]
234
242
  if hit.size > 0:
235
243
  return np.max(hit) + 1
upgini/autofe/feature.py CHANGED
@@ -26,18 +26,9 @@ class Column:
26
26
  return dict()
27
27
 
28
28
  def rename_columns(self, mapping: Dict[str, str]) -> "Column":
29
- self.name = self._unhash(mapping.get(self.name) or self.name)
29
+ self.name = mapping.get(self.name) or self.name
30
30
  return self
31
31
 
32
- def _unhash(self, feature_name: str) -> str:
33
- last_component_idx = feature_name.rfind("_")
34
- if not feature_name.startswith("f_"):
35
- return feature_name # etalon feature
36
- elif last_component_idx == 1:
37
- return feature_name[2:] # fully hashed name, cannot unhash
38
- else:
39
- return feature_name[2:last_component_idx]
40
-
41
32
  def delete_data(self):
42
33
  self.data = None
43
34
 
upgini/autofe/vector.py CHANGED
@@ -2,7 +2,11 @@ import abc
2
2
  from typing import Dict, List, Optional
3
3
 
4
4
  import pandas as pd
5
- from pydantic import validator
5
+
6
+ try:
7
+ from pydantic import field_validator as validator # V2
8
+ except ImportError:
9
+ from pydantic import validator # V1
6
10
 
7
11
  from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
8
12
 
@@ -29,12 +33,16 @@ class Sum(PandasOperand, VectorizableMixin):
29
33
  class TimeSeriesBase(PandasOperand, abc.ABC):
30
34
  is_vector: bool = True
31
35
  date_unit: Optional[str] = None
36
+ offset_size: int = 0
37
+ offset_unit: str = "D"
32
38
 
33
39
  def get_params(self) -> Dict[str, Optional[str]]:
34
40
  res = super().get_params()
35
41
  res.update(
36
42
  {
37
43
  "date_unit": self.date_unit,
44
+ "offset_size": self.offset_size,
45
+ "offset_unit": self.offset_unit,
38
46
  }
39
47
  )
40
48
  return res
@@ -46,13 +54,31 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
46
54
  ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
47
55
  ts.set_index(date.name, inplace=True)
48
56
  ts = ts[ts.index.notna()].sort_index()
49
- ts = ts.groupby([c.name for c in data[1:-1]]) if len(data) > 2 else ts
57
+ ts = (
58
+ ts.groupby([c.name for c in data[1:-1]])
59
+ .apply(self._shift)[data[-1].name]
60
+ .to_frame()
61
+ .reset_index()
62
+ .set_index(date.name)
63
+ .groupby([c.name for c in data[1:-1]])
64
+ if len(data) > 2
65
+ else self._shift(ts)
66
+ )
50
67
  ts = self._aggregate(ts)
51
68
  ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
52
69
  ts.index = date.index
53
70
 
54
71
  return ts.iloc[:, -1]
55
72
 
73
+ def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
74
+ if self.offset_size > 0:
75
+ return ts.iloc[:, :-1].merge(
76
+ ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
77
+ left_index=True,
78
+ right_index=True,
79
+ )
80
+ return ts
81
+
56
82
  @abc.abstractmethod
57
83
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
58
84
  pass
@@ -67,6 +93,7 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
67
93
  window_unit: str = "D"
68
94
 
69
95
  @validator("window_unit")
96
+ @classmethod
70
97
  def validate_window_unit(cls, v: str) -> str:
71
98
  try:
72
99
  pd.tseries.frequencies.to_offset(v)
@@ -77,12 +104,35 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
77
104
  )
78
105
 
79
106
  def to_formula(self) -> str:
80
- return f"roll_{self.window_size}{self.window_unit}_{self.aggregation}"
107
+ roll_component = f"roll_{self.window_size}{self.window_unit}"
108
+ if self.offset_size > 0:
109
+ roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
110
+ return f"{roll_component}_{self.aggregation}"
81
111
 
82
112
  @classmethod
83
113
  def from_formula(cls, formula: str) -> Optional["Roll"]:
84
114
  import re
85
115
 
116
+ # Try matching pattern with offset first
117
+ pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
118
+ match_with_offset = re.match(pattern_with_offset, formula)
119
+
120
+ if match_with_offset:
121
+ window_size = int(match_with_offset.group(1))
122
+ window_unit = match_with_offset.group(2)
123
+ offset_size = int(match_with_offset.group(3))
124
+ offset_unit = match_with_offset.group(4)
125
+ aggregation = match_with_offset.group(5)
126
+
127
+ return cls(
128
+ window_size=window_size,
129
+ window_unit=window_unit,
130
+ offset_size=offset_size,
131
+ offset_unit=offset_unit,
132
+ aggregation=aggregation,
133
+ )
134
+
135
+ # If no offset pattern found, try basic pattern
86
136
  pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
87
137
  match = re.match(pattern, formula)
88
138
 
@@ -107,7 +157,7 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
107
157
  return res
108
158
 
109
159
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
110
- return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
160
+ return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
111
161
  _roll_aggregations.get(self.aggregation, self.aggregation)
112
162
  )
113
163
 
@@ -117,12 +167,33 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
117
167
  lag_unit: str = "D"
118
168
 
119
169
  def to_formula(self) -> str:
120
- return f"lag_{self.lag_size}{self.lag_unit}"
170
+ lag_component = f"lag_{self.lag_size}{self.lag_unit}"
171
+ if self.offset_size > 0:
172
+ lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
173
+ return lag_component
121
174
 
122
175
  @classmethod
123
176
  def from_formula(cls, formula: str) -> Optional["Lag"]:
124
177
  import re
125
178
 
179
+ # Try matching pattern with offset first
180
+ pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
181
+ match_with_offset = re.match(pattern_with_offset, formula)
182
+
183
+ if match_with_offset:
184
+ lag_size = int(match_with_offset.group(1))
185
+ lag_unit = match_with_offset.group(2)
186
+ offset_size = int(match_with_offset.group(3))
187
+ offset_unit = match_with_offset.group(4)
188
+
189
+ return cls(
190
+ lag_size=lag_size,
191
+ lag_unit=lag_unit,
192
+ offset_size=offset_size,
193
+ offset_unit=offset_unit,
194
+ )
195
+
196
+ # If no offset pattern found, try basic pattern
126
197
  pattern = r"^lag_(\d+)([a-zA-Z])$"
127
198
  match = re.match(pattern, formula)
128
199
 
@@ -136,6 +207,12 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
136
207
 
137
208
  def get_params(self) -> Dict[str, Optional[str]]:
138
209
  res = super().get_params()
210
+ res.update(
211
+ {
212
+ "lag_size": self.lag_size,
213
+ "lag_unit": self.lag_unit,
214
+ }
215
+ )
139
216
  return res
140
217
 
141
218
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.57a2
3
+ Version: 1.2.57a3675.dev5
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -34,7 +34,7 @@ Requires-Dist: pydantic<3.0.0,>1.0.0
34
34
  Requires-Dist: pyjwt>=2.8.0
35
35
  Requires-Dist: python-bidi==0.4.2
36
36
  Requires-Dist: python-dateutil>=2.8.0
37
- Requires-Dist: python-json-logger>=2.0.2
37
+ Requires-Dist: python-json-logger>=3.3.0
38
38
  Requires-Dist: requests>=2.8.0
39
39
  Requires-Dist: scikit-learn>=1.3.0
40
40
  Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
@@ -1,4 +1,4 @@
1
- upgini/__about__.py,sha256=PD2lbh5FQufk15oyUAYIGJrdUHAs9qG5Btw3lTqrUtI,25
1
+ upgini/__about__.py,sha256=I0ZAa2qUeGAG8w2GcOhss1hhvV9cMS2KXnSkGWg4s0A,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=NP5vHqEfZQ1HWz3TcNAa_OhXG8wiMRdydm26D6UBiRU,34166
@@ -16,12 +16,12 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
16
16
  upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  upgini/autofe/all_operands.py,sha256=v0_NozalvvzeojSAA0d7UJ5INS654ZVaLn4S8djK6Ac,329
18
18
  upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
19
- upgini/autofe/date.py,sha256=d-sijAD7dETfqIOCaZh1vhuVjsS_nqa-6dhjwkCdny4,10441
20
- upgini/autofe/feature.py,sha256=l8A8E3BH2BmYvqEC81zbcIEfH6KEEhcesJ2BH4fn0-4,15140
19
+ upgini/autofe/date.py,sha256=oykxfmny4LOr6m79IipOUCtk2JQSUdSCWHh8K9n7nek,10726
20
+ upgini/autofe/feature.py,sha256=zvRdlxCkaOsX0XiragNvh0tAPyOWut0MQTq5JGU5HtY,14749
21
21
  upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
22
22
  upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
23
23
  upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
24
- upgini/autofe/vector.py,sha256=bvcop9b0uFFPfQ3FLTwXT2IYfxNl4dIfR8icvnBHvOA,4358
24
+ upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
26
  upgini/data_source/data_source_publisher.py,sha256=0vaYz5v3KclJnA6jAWiTUiMQO5mbBTBINWV9jr2F5xM,22591
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
59
59
  upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
60
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
- upgini-1.2.57a2.dist-info/METADATA,sha256=-dEVxWnjwc3LcSqFVJGENL07YJDvWgH8mHQ0PaE93sI,49057
63
- upgini-1.2.57a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
- upgini-1.2.57a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
- upgini-1.2.57a2.dist-info/RECORD,,
62
+ upgini-1.2.57a3675.dev5.dist-info/METADATA,sha256=7bDZbjWy8pxCvyBM02xr2nvMfKLcDBh2Agf07aKc4fI,49065
63
+ upgini-1.2.57a3675.dev5.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
64
+ upgini-1.2.57a3675.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.57a3675.dev5.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.24.2
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any