upgini 1.2.62a3818.dev4__py3-none-any.whl → 1.2.65a3818.dev5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.62a3818.dev4"
1
+ __version__ = "1.2.65a3818.dev5"
upgini/autofe/timeseries/delta.py CHANGED
@@ -1,15 +1,34 @@
1
1
  import pandas as pd
2
- from typing import Dict, Optional
2
+ from typing import Dict, Optional, Union
3
3
 
4
4
  from upgini.autofe.operator import ParametrizedOperator
5
5
  from upgini.autofe.timeseries.base import TimeSeriesBase
6
6
  from upgini.autofe.timeseries.lag import Lag
7
7
 
8
8
 
9
- class Delta(TimeSeriesBase, ParametrizedOperator):
9
+ class DeltaBase(TimeSeriesBase):
10
10
  delta_size: int
11
11
  delta_unit: str = "D"
12
12
 
13
+ def get_params(self) -> Dict[str, Optional[str]]:
14
+ res = super().get_params()
15
+ res.update(
16
+ {
17
+ "delta_size": self.delta_size,
18
+ "delta_unit": self.delta_unit,
19
+ }
20
+ )
21
+ return res
22
+
23
+ def _calculate_delta(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
24
+ return_series = isinstance(x, pd.Series)
25
+ x = pd.DataFrame(x)
26
+ lag = Lag(lag_size=self.delta_size, lag_unit=self.delta_unit)
27
+ x.iloc[:, -1] = x.iloc[:, -1] - lag._aggregate(x.iloc[:, -1])
28
+ return x.iloc[:, -1] if return_series else x
29
+
30
+
31
+ class Delta(DeltaBase, ParametrizedOperator):
13
32
  def to_formula(self) -> str:
14
33
  base_formula = f"delta_{self.delta_size}{self.delta_unit}"
15
34
  return self._add_offset_to_formula(base_formula)
@@ -47,28 +66,11 @@ class Delta(TimeSeriesBase, ParametrizedOperator):
47
66
 
48
67
  return cls(**params)
49
68
 
50
- def get_params(self) -> Dict[str, Optional[str]]:
51
- res = super().get_params()
52
- res.update(
53
- {
54
- "delta_size": self.delta_size,
55
- "delta_unit": self.delta_unit,
56
- "offset_size": self.offset_size,
57
- "offset_unit": self.offset_unit,
58
- }
59
- )
60
- return res
61
-
62
69
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
63
- lag0 = Lag(lag_size=0, lag_unit=self.delta_unit)
64
- lag = Lag(lag_size=self.delta_size, lag_unit=self.delta_unit)
65
- return lag0._aggregate(ts) - lag._aggregate(ts)
70
+ return ts.apply(self._calculate_delta).iloc[:, [-1]]
66
71
 
67
72
 
68
- class Delta2(TimeSeriesBase, ParametrizedOperator):
69
- delta_size: int
70
- delta_unit: str = "D"
71
-
73
+ class Delta2(DeltaBase, ParametrizedOperator):
72
74
  def to_formula(self) -> str:
73
75
  base_formula = f"delta2_{self.delta_size}{self.delta_unit}"
74
76
  return self._add_offset_to_formula(base_formula)
@@ -106,20 +108,12 @@ class Delta2(TimeSeriesBase, ParametrizedOperator):
106
108
 
107
109
  return cls(**params)
108
110
 
109
- def get_params(self) -> Dict[str, Optional[str]]:
110
- res = super().get_params()
111
- res.update(
112
- {
113
- "delta_size": self.delta_size,
114
- "delta_unit": self.delta_unit,
115
- }
116
- )
117
- return res
118
-
119
111
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
112
+ return ts.apply(self._calculate_delta2).iloc[:, [-1]]
113
+
114
+ def _calculate_delta2(self, x):
120
115
  # Calculate first delta
121
- delta1 = Delta(delta_size=self.delta_size, delta_unit=self.delta_unit)
122
- first_delta = delta1._aggregate(ts)
116
+ first_delta = self._calculate_delta(x)
123
117
 
124
118
  # Calculate delta of delta (second derivative)
125
- return delta1._aggregate(first_delta)
119
+ return self._calculate_delta(first_delta)
upgini/features_enricher.py CHANGED
@@ -3632,7 +3632,7 @@ if response.status_code == 200:
3632
3632
  )
3633
3633
  do_sorting = False
3634
3634
  else:
3635
- columns_to_hash = list(search_keys.keys()) + renamed_id_columns + [target_name]
3635
+ columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
3636
3636
  columns_to_hash = sort_columns(
3637
3637
  df[columns_to_hash],
3638
3638
  target_name,
upgini/utils/mstats.py CHANGED
@@ -3,7 +3,6 @@ from collections import namedtuple
3
3
 
4
4
  import numpy as np
5
5
  import numpy.ma as ma
6
- import scipy
7
6
  from joblib import Parallel, delayed
8
7
  from numpy import ndarray
9
8
  from psutil import cpu_count
@@ -116,6 +115,22 @@ def spearmanr(
116
115
  if nan_policy == "omit":
117
116
  x = mask_fn(x)
118
117
 
118
+ # - dof: degrees of freedom
119
+ # - t_stat: t-statistic
120
+ # - alternative: 'two-sided', 'greater', 'less'
121
+ def compute_t_pvalue(t_stat, dof, alternative='two-sided'):
122
+ from scipy.stats import t
123
+
124
+ if alternative == "two-sided":
125
+ prob = 2 * t.sf(abs(t_stat), dof)
126
+ elif alternative == "greater":
127
+ prob = t.sf(t_stat, dof)
128
+ elif alternative == "less":
129
+ prob = t.cdf(t_stat, dof)
130
+ else:
131
+ raise ValueError(f"Unknown alternative: {alternative}")
132
+ return t_stat, prob
133
+
119
134
  def _spearmanr_2cols(x):
120
135
  # Mask the same observations for all variables, and then drop those
121
136
  # observations (can't leave them masked, rankdata is weird).
@@ -142,7 +157,7 @@ def spearmanr(
142
157
  # errors before taking the square root
143
158
  t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
144
159
 
145
- t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)
160
+ t, prob = compute_t_pvalue(dof, t, alternative)
146
161
 
147
162
  # For backwards compatibility, return scalars when comparing 2 columns
148
163
  if rs.shape == (2, 2):
upgini/utils/sort.py CHANGED
@@ -28,12 +28,13 @@ def sort_columns(
28
28
  logger = logging.getLogger(__name__)
29
29
  logger.setLevel(logging.FATAL)
30
30
  df = df.copy() # avoid side effects
31
+ search_keys = {k: v for k, v in search_keys.items() if v != SearchKey.CUSTOM_KEY}
31
32
 
32
33
  # Check multiple search keys
33
34
  search_key_values = list(search_keys.values())
34
35
  has_duplicate_search_keys = len(search_key_values) != len(set(search_key_values))
35
36
  if has_duplicate_search_keys:
36
- logging.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")
37
+ logger.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")
37
38
 
38
39
  sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
39
40
  sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]
@@ -68,8 +69,9 @@ def get_sort_columns_dict(
68
69
  if len(string_features) > 0:
69
70
  if len(df) > len(df.drop(columns=string_features).drop_duplicates()) or sort_all_columns:
70
71
  # factorize string features
72
+ df = df.copy()
71
73
  for c in string_features:
72
- df.loc[:, c] = pd.Series(df[c].factorize(sort=True)[0], index=df.index, dtype="int")
74
+ df = df.assign(**{c: pd.factorize(df[c], sort=True)[0].astype(int)})
73
75
  columns_for_sort.extend(string_features)
74
76
 
75
77
  if len(columns_for_sort) == 0:
upgini-1.2.65a3818.dev5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.62a3818.dev4
3
+ Version: 1.2.65a3818.dev5
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -38,6 +38,7 @@ Requires-Dist: python-dateutil>=2.8.0
38
38
  Requires-Dist: python-json-logger>=3.3.0
39
39
  Requires-Dist: requests>=2.8.0
40
40
  Requires-Dist: scikit-learn>=1.3.0
41
+ Requires-Dist: scipy>=1.10.0
41
42
  Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
42
43
  Description-Content-Type: text/markdown
43
44
 
upgini-1.2.65a3818.dev5.dist-info/RECORD CHANGED
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=DRRGQ9hjWuzUUDq0H9hZpymmoGVeS9BXeeOQ2XoHmjc,33
1
+ upgini/__about__.py,sha256=WOd200tkoJAB3TJo-txEZDg-U6YHThQugeQsJTyxs64,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=cB2I5rNpbztjkYEEW5aJuKj2fCMnfxp40X4Eo63oyuQ,205340
6
+ upgini/features_enricher.py,sha256=nXGBMC42VPAmqQKXbEqZJFIHiGj6F_G2AwhurA8LuQs,205351
7
7
  upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
@@ -25,7 +25,7 @@ upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
25
25
  upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
26
26
  upgini/autofe/timeseries/base.py,sha256=T9Ec8LKJbiwTUGGsd_xhM0U0NUJblqmKchkzUI1sK88,3755
27
27
  upgini/autofe/timeseries/cross.py,sha256=8ggDhsvwdxHkrWKRPl2fcFt7wamTYhkVzQcOWvIIyvU,4612
28
- upgini/autofe/timeseries/delta.py,sha256=hXEiFWHdZndz8I7Ef5zhTHLJac9illhZOZITwpL9ppw,3618
28
+ upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
29
29
  upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
30
30
  upgini/autofe/timeseries/roll.py,sha256=bNFMDszSYTWvB7EyhHbRY1DJqzSURvHlPAcBebt0y0Y,2878
31
31
  upgini/autofe/timeseries/trend.py,sha256=eP0q1fBW4MYPrjfy7vr88tTG8qk0xypClaGHaVv1hAs,1962
@@ -60,17 +60,17 @@ upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,
60
60
  upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
61
61
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
62
62
  upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
63
- upgini/utils/mstats.py,sha256=GjBAUacgfAoVQVFUrMiRYdVkmx93CIThLRNvYLLiV48,5765
63
+ upgini/utils/mstats.py,sha256=dLJQr5Ak5BAoV-pDPpnfvMURZVkZ3_v250QzAsSlqY4,6286
64
64
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
65
65
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
66
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
67
67
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
68
- upgini/utils/sort.py,sha256=w-CoT33W_53ekOROpKI_VRsRmiyWNr2b3IpE5_4MLLA,6395
68
+ upgini/utils/sort.py,sha256=GfWfCIbfK7e7BvSPZZNJD-PEtiN19DnTCEQkeefHHxI,6491
69
69
  upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.62a3818.dev4.dist-info/METADATA,sha256=_sL9eQLnB5X1kyhbUiMzXIB5HUgK0KFfmuwgp3Su59c,49094
74
- upgini-1.2.62a3818.dev4.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
75
- upgini-1.2.62a3818.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.62a3818.dev4.dist-info/RECORD,,
73
+ upgini-1.2.65a3818.dev5.dist-info/METADATA,sha256=ThM2VKSF3JJw-jQz1wSWOQh81T7FsB4FvnjB5r9bcKk,49123
74
+ upgini-1.2.65a3818.dev5.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
75
+ upgini-1.2.65a3818.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.65a3818.dev5.dist-info/RECORD,,