upgini 1.2.155.dev2__tar.gz → 1.2.156.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/PKG-INFO +1 -1
  2. upgini-1.2.156.dev1/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/date.py +30 -21
  4. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/timeseries/delta.py +2 -0
  5. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/timeseries/trend.py +5 -3
  6. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/timeseries/volatility.py +4 -0
  7. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/unary.py +6 -9
  8. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/utils.py +43 -1
  9. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/feature_info.py +2 -1
  10. upgini-1.2.155.dev2/src/upgini/__about__.py +0 -1
  11. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/.gitignore +0 -0
  12. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/LICENSE +0 -0
  13. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/README.md +0 -0
  14. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/pyproject.toml +0 -0
  15. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/__init__.py +0 -0
  16. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/ads.py +0 -0
  17. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/ads_management/__init__.py +0 -0
  18. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
  19. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/__init__.py +0 -0
  20. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/all_operators.py +0 -0
  21. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/binary.py +0 -0
  22. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/feature.py +0 -0
  23. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/groupby.py +0 -0
  24. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/operator.py +0 -0
  25. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/timeseries/__init__.py +0 -0
  26. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/timeseries/base.py +0 -0
  27. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/timeseries/cross.py +0 -0
  28. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/timeseries/lag.py +0 -0
  29. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/timeseries/roll.py +0 -0
  30. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/autofe/vector.py +0 -0
  31. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/data_source/__init__.py +0 -0
  32. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/data_source/data_source_publisher.py +0 -0
  33. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/dataset.py +0 -0
  34. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/errors.py +0 -0
  35. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/features_enricher.py +0 -0
  36. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/http.py +0 -0
  37. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/mdc/__init__.py +0 -0
  38. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/mdc/context.py +0 -0
  39. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/metadata.py +0 -0
  40. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/metrics.py +0 -0
  41. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/normalizer/__init__.py +0 -0
  42. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/normalizer/normalize_utils.py +0 -0
  43. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
  44. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
  45. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/resource_bundle/strings.properties +0 -0
  46. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  47. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/sampler/__init__.py +0 -0
  48. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/sampler/base.py +0 -0
  49. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
  50. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/sampler/utils.py +0 -0
  51. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/search_task.py +0 -0
  52. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/spinner.py +0 -0
  53. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  54. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/__init__.py +0 -0
  55. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
  56. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
  57. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/config.py +0 -0
  58. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/datetime_utils.py +0 -0
  62. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/deduplicate_utils.py +0 -0
  63. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/display_utils.py +0 -0
  64. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/email_utils.py +0 -0
  65. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  66. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/features_validator.py +0 -0
  67. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/format.py +0 -0
  68. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/hash_utils.py +0 -0
  69. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/ip_utils.py +0 -0
  70. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/mstats.py +0 -0
  71. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/one_hot_encoder.py +0 -0
  72. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/phone_utils.py +0 -0
  73. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/postal_code_utils.py +0 -0
  74. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/progress_bar.py +0 -0
  75. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/psi.py +0 -0
  76. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/sample_utils.py +0 -0
  77. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/sklearn_ext.py +0 -0
  78. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/sort.py +0 -0
  79. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/target_utils.py +0 -0
  80. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/track_info.py +0 -0
  81. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/ts_utils.py +0 -0
  82. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/utils/warning_counter.py +0 -0
  83. {upgini-1.2.155.dev2 → upgini-1.2.156.dev1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.155.dev2
3
+ Version: 1.2.156.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.156.dev1"
@@ -8,7 +8,7 @@ from pandas.core.arrays.timedeltas import TimedeltaArray
8
8
  from pydantic import BaseModel, __version__ as pydantic_version
9
9
 
10
10
  from upgini.autofe.operator import PandasOperator, ParametrizedOperator
11
- from upgini.autofe.utils import pydantic_validator
11
+ from upgini.autofe.utils import bin_index, bin_index_many, bin_index_vectorized, pydantic_validator
12
12
 
13
13
 
14
14
  def get_pydantic_version():
@@ -254,26 +254,33 @@ class DatePercentileBase(PandasOperator, abc.ABC):
254
254
  left = pd.to_datetime(left, unit=self.date_unit)
255
255
 
256
256
  bounds = self._get_bounds(left)
257
+ values = pd.to_numeric(right, errors="coerce").to_numpy(dtype=np.float64, copy=False)
258
+ bounds_list = bounds.tolist()
259
+ result = np.full(len(values), np.nan)
260
+
261
+ if not bounds_list:
262
+ return pd.Series(result, index=right.index).astype(pd.Int64Dtype()).astype("category")
263
+
264
+ bounds_lengths = {len(b) for b in bounds_list if isinstance(b, (list, np.ndarray))}
265
+ if len(bounds_lengths) == 1 and all(isinstance(b, (list, np.ndarray)) for b in bounds_list):
266
+ bounds_2d = np.asarray(bounds_list, dtype=np.float64)
267
+ if bounds_2d.ndim == 1:
268
+ result = bin_index_vectorized(values, bounds_2d)
269
+ else:
270
+ result = bin_index_many(values, bounds_2d)
271
+ else:
272
+ for i, row_bounds in enumerate(bounds_list):
273
+ if isinstance(row_bounds, (list, np.ndarray)) and len(row_bounds) > 0:
274
+ result[i] = bin_index(values[i], row_bounds)
257
275
 
258
- return (
259
- right.index.to_series()
260
- .apply(lambda i: self._perc(right[i], bounds[i]))
261
- .astype(pd.Int64Dtype())
262
- .astype("category")
263
- )
276
+ return pd.Series(result, index=right.index).astype(pd.Int64Dtype()).astype("category")
264
277
 
265
278
  @abc.abstractmethod
266
279
  def _get_bounds(self, date_col: pd.Series) -> pd.Series:
267
280
  pass
268
281
 
269
282
  def _perc(self, f, bounds):
270
- if f is None or np.isnan(f):
271
- return np.nan
272
- hit = np.where(f >= np.array(bounds))[0]
273
- if hit.size > 0:
274
- return np.max(hit) + 1
275
- else:
276
- return np.nan
283
+ return bin_index(f, bounds)
277
284
 
278
285
  def get_params(self) -> Dict[str, Optional[str]]:
279
286
  res = super().get_params()
@@ -313,13 +320,15 @@ class DatePercentile(DatePercentileBase):
313
320
  return value
314
321
 
315
322
  def _get_bounds(self, date_col: pd.Series) -> pd.Series:
316
- months = date_col.dt.month
317
- years = date_col.dt.year
318
-
319
- month_diffs = 12 * (years - (self.zero_year or 0)) + (months - (self.zero_month or 0))
320
- return month_diffs.apply(
321
- lambda d: np.array(self.zero_bounds if self.zero_bounds is not None else []) + d * self.step
322
- )
323
+ zero_bounds = self.zero_bounds if self.zero_bounds is not None else []
324
+ if not zero_bounds:
325
+ return pd.Series([[] for _ in range(len(date_col))], index=date_col.index)
326
+
327
+ month_diffs = (
328
+ 12 * (date_col.dt.year - (self.zero_year or 0)) + (date_col.dt.month - (self.zero_month or 0))
329
+ ).to_numpy()
330
+ bounds_2d = np.asarray(zero_bounds, dtype=np.float64) + month_diffs[:, None] * self.step
331
+ return pd.Series(list(bounds_2d), index=date_col.index)
323
332
 
324
333
 
325
334
  class DatePercentileMethod2(DatePercentileBase):
@@ -23,6 +23,8 @@ class DeltaBase(TimeSeriesBase):
23
23
  def _calculate_delta(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
24
24
  return_series = isinstance(x, pd.Series)
25
25
  x = pd.DataFrame(x)
26
+ value_col = x.columns[-1]
27
+ x[value_col] = pd.to_numeric(x[value_col], errors="coerce").astype("float64")
26
28
  lag = Lag(lag_size=self.delta_size, lag_unit=self.delta_unit)
27
29
  x.iloc[:, -1] = x.iloc[:, -1] - lag._aggregate(x.iloc[:, -1])
28
30
  return x.iloc[:, -1] if return_series else x
@@ -54,12 +54,14 @@ class TrendCoefficient(TimeSeriesBase, ParametrizedOperator):
54
54
  return_series = isinstance(x, pd.Series)
55
55
  x = pd.DataFrame(x)
56
56
  resampled = (
57
- x.iloc[:, -1].resample(f"{self.step_size}{self.step_unit}").fillna(method="ffill").fillna(method="bfill")
57
+ x.iloc[:, -1].resample(f"{self.step_size}{self.step_unit}").ffill().bfill()
58
58
  )
59
59
  idx = np.arange(len(resampled))
60
+ value_col = x.columns[-1]
61
+ x[value_col] = pd.to_numeric(x[value_col], errors="coerce").astype("float64")
60
62
  try:
61
63
  coeffs = np.polyfit(idx, resampled, 1)
62
- x.iloc[:, -1] = coeffs[0]
64
+ x.iloc[:, -1] = float(coeffs[0])
63
65
  except np.linalg.LinAlgError:
64
- x.iloc[:, -1] = 0
66
+ x.iloc[:, -1] = 0.0
65
67
  return x.iloc[:, -1] if return_series else x
@@ -65,6 +65,8 @@ class EWMAVolatility(VolatilityBase, ParametrizedOperator):
65
65
  def _ewma_vol(self, x):
66
66
  return_series = isinstance(x, pd.Series)
67
67
  x = pd.DataFrame(x)
68
+ value_col = x.columns[-1]
69
+ x[value_col] = pd.to_numeric(x[value_col], errors="coerce").astype("float64")
68
70
  returns = self._get_returns(x.iloc[:, -1], f"{self.step_size}{self.step_unit}")
69
71
  x.iloc[:, -1] = returns.ewm(span=self.window_size).std()
70
72
  return x.iloc[:, -1] if return_series else x
@@ -93,6 +95,8 @@ class RollingVolBase(VolatilityBase):
93
95
  ) -> Union[pd.DataFrame, pd.Series]:
94
96
  return_series = isinstance(x, pd.Series)
95
97
  x = pd.DataFrame(x)
98
+ value_col = x.columns[-1]
99
+ x[value_col] = pd.to_numeric(x[value_col], errors="coerce").astype("float64")
96
100
  returns = self._get_returns(x.iloc[:, -1], f"{self.step_size}{self.step_unit}")
97
101
  if abs_returns:
98
102
  returns = returns.abs()
@@ -4,7 +4,7 @@ import numpy as np
4
4
  import pandas as pd
5
5
 
6
6
  from upgini.autofe.operator import PandasOperator, ParametrizedOperator, VectorizableMixin
7
- from upgini.autofe.utils import pydantic_validator
7
+ from upgini.autofe.utils import bin_index, bin_index_vectorized, pydantic_validator
8
8
 
9
9
 
10
10
  class Abs(PandasOperator, VectorizableMixin):
@@ -163,16 +163,13 @@ class Bin(PandasOperator):
163
163
  is_categorical: bool = True
164
164
 
165
165
  def calculate_unary(self, data: pd.Series) -> pd.Series:
166
- return data.apply(self._bin, bounds=self.bin_bounds).fillna(-1).astype(int).astype("category")
166
+ bounds_arr = np.asarray(self.bin_bounds, dtype=np.float64)
167
+ values = pd.to_numeric(data, errors="coerce").to_numpy(dtype=np.float64, copy=False)
168
+ result = bin_index_vectorized(values, bounds_arr)
169
+ return pd.Series(result, index=data.index).fillna(-1).astype(int).astype("category")
167
170
 
168
171
  def _bin(self, f, bounds):
169
- if f is None or np.isnan(f):
170
- return np.nan
171
- hit = np.where(f >= np.array(bounds))[0]
172
- if hit.size > 0:
173
- return np.max(hit) + 1
174
- else:
175
- return np.nan
172
+ return bin_index(f, bounds)
176
173
 
177
174
  def get_params(self) -> Dict[str, Optional[str]]:
178
175
  res = super().get_params()
@@ -3,8 +3,9 @@ Utility functions for autofe module.
3
3
  """
4
4
 
5
5
  import functools
6
- from typing import Callable
6
+ from typing import Callable, Union
7
7
 
8
+ import numpy as np
8
9
  from pydantic import BaseModel
9
10
 
10
11
 
@@ -111,3 +112,44 @@ def pydantic_copy_method(obj):
111
112
  return obj.model_copy
112
113
  else:
113
114
  return obj.copy
115
+
116
+
117
def bin_index(value: Union[float, int, None], bounds) -> float:
    """Return the 1-based index of the bin that ``value`` falls into.

    ``bounds`` holds the bin edges in ascending order; the result is the
    number of edges that are ``<= value``.

    Args:
        value: Scalar to classify. ``None`` and NaN are treated as missing.
        bounds: Ascending bin edges (anything convertible to a float array).

    Returns:
        ``np.nan`` when ``value`` is missing, when ``bounds`` is empty, or when
        ``value`` lies below the first edge. Otherwise the position returned by
        ``np.searchsorted(..., side="right")`` (a NumPy integer in
        ``1..len(bounds)``).
    """
    # NumPy float scalars other than float64 (float32, float16, ...) do not
    # subclass the built-in ``float``. Without ``np.floating`` here a float32
    # NaN would skip this guard and be binned as ``len(bounds)``, because
    # searchsorted orders NaN after every finite edge.
    if value is None or (isinstance(value, (float, np.floating)) and np.isnan(value)):
        return np.nan
    bounds_arr = np.asarray(bounds, dtype=np.float64)
    if bounds_arr.size == 0 or value < bounds_arr[0]:
        return np.nan
    return np.searchsorted(bounds_arr, value, side="right")
124
+
125
+
126
def bin_index_vectorized(values: np.ndarray, bounds: np.ndarray) -> np.ndarray:
    """Bin every element of ``values`` against one shared set of edges.

    Args:
        values: 1-D array-like of numbers; NaN (or ``None``) marks a missing
            entry.
        bounds: 1-D ascending bin edges shared by all values.

    Returns:
        A float64 array of the same length as ``values``. Each entry is the
        1-based bin index (number of edges ``<= value``), or NaN when the value
        is missing, lies below the first edge, or ``bounds`` is empty.
    """
    # Coerce ``values`` the same way ``bounds`` is coerced so that lists and
    # object-dtype arrays (e.g. containing None) work with np.isnan and
    # boolean indexing. For a float64 ndarray this is a no-copy pass-through.
    values_arr = np.asarray(values, dtype=np.float64)
    result = np.full(len(values_arr), np.nan)
    bounds_arr = np.asarray(bounds, dtype=np.float64)
    if bounds_arr.size == 0:
        return result

    valid = ~np.isnan(values_arr)
    if not valid.any():
        return result

    valid_values = values_arr[valid]
    # Cast to float so positions below the first edge can be overwritten with NaN.
    idx = np.searchsorted(bounds_arr, valid_values, side="right").astype(np.float64)
    idx[valid_values < bounds_arr[0]] = np.nan
    result[valid] = idx
    return result
142
+
143
+
144
def bin_index_many(values: np.ndarray, bounds_2d: np.ndarray) -> np.ndarray:
    """Bin each value against its own row of edges.

    Args:
        values: 1-D array-like of ``n`` numbers; NaN marks a missing entry.
        bounds_2d: 2-D array with at least ``n`` rows; row ``i`` holds the
            ascending bin edges for ``values[i]``.

    Returns:
        A float64 array of length ``n``. Entry ``i`` is the 1-based bin index of
        ``values[i]`` within row ``i`` (number of edges ``<= value``), or NaN
        when the value is missing, lies below the row's first edge, or the rows
        have no edges.

    Raises:
        IndexError: if ``bounds_2d`` has fewer rows than ``values`` has entries.
    """
    vals = np.asarray(values, dtype=np.float64)
    n = len(vals)
    result = np.full(n, np.nan)
    if n == 0:
        return result

    bounds_arr = np.asarray(bounds_2d)
    if bounds_arr.shape[0] < n:
        raise IndexError(f"bounds_2d has {bounds_arr.shape[0]} rows but {n} values were given")
    rows = bounds_arr[:n]
    if rows.shape[1] == 0:
        return result

    # For an ascending row, the number of edges <= v equals
    # np.searchsorted(row, v, side="right"), so a single broadcast comparison
    # replaces the per-row Python loop. NaN comparisons are simply False.
    with np.errstate(invalid="ignore"):
        counts = (rows <= vals[:, None]).sum(axis=1)
        keep = ~np.isnan(vals) & ~(vals < rows[:, 0])
    result[keep] = counts[keep]
    return result
@@ -104,7 +104,8 @@ def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.Data
104
104
  if data is not None and len(data) > 0 and feature_meta.name in data.columns:
105
105
  if len(data) > 3:
106
106
  rand = np.random.RandomState(42)
107
- feature_sample = rand.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
107
+ unique_values = sorted(data[feature_meta.name].dropna().unique(), key=str)
108
+ feature_sample = rand.choice(unique_values, 3, replace=False).tolist()
108
109
  else:
109
110
  feature_sample = data[feature_meta.name].dropna().unique().tolist()
110
111
  if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
@@ -1 +0,0 @@
1
- __version__ = "1.2.155.dev2"
File without changes
File without changes
File without changes