upgini 1.2.62a3818.dev4__tar.gz → 1.2.65a3818.dev5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/PKG-INFO +2 -1
  2. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/pyproject.toml +1 -0
  3. upgini-1.2.65a3818.dev5/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/timeseries/delta.py +28 -34
  5. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/features_enricher.py +1 -1
  6. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/mstats.py +17 -2
  7. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/sort.py +4 -2
  8. upgini-1.2.62a3818.dev4/src/upgini/__about__.py +0 -1
  9. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/.gitignore +0 -0
  10. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/LICENSE +0 -0
  11. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/README.md +0 -0
  12. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/__init__.py +0 -0
  13. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/ads.py +0 -0
  14. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/ads_management/__init__.py +0 -0
  15. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/ads_management/ads_manager.py +0 -0
  16. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/__init__.py +0 -0
  17. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/all_operands.py +0 -0
  18. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/binary.py +0 -0
  19. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/date.py +0 -0
  20. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/feature.py +0 -0
  21. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/groupby.py +0 -0
  22. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/operator.py +0 -0
  23. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/timeseries/__init__.py +0 -0
  24. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/timeseries/base.py +0 -0
  25. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/timeseries/cross.py +0 -0
  26. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/timeseries/lag.py +0 -0
  27. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/timeseries/roll.py +0 -0
  28. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/timeseries/trend.py +0 -0
  29. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/timeseries/volatility.py +0 -0
  30. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/unary.py +0 -0
  31. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/vector.py +0 -0
  32. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/data_source/__init__.py +0 -0
  33. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/data_source/data_source_publisher.py +0 -0
  34. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/dataset.py +0 -0
  35. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/errors.py +0 -0
  36. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/http.py +0 -0
  37. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/lazy_import.py +0 -0
  38. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/mdc/__init__.py +0 -0
  39. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/mdc/context.py +0 -0
  40. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/metadata.py +0 -0
  41. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/metrics.py +0 -0
  42. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/normalizer/__init__.py +0 -0
  43. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/normalizer/normalize_utils.py +0 -0
  44. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/resource_bundle/__init__.py +0 -0
  45. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/resource_bundle/exceptions.py +0 -0
  46. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/resource_bundle/strings.properties +0 -0
  47. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  48. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/sampler/__init__.py +0 -0
  49. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/sampler/base.py +0 -0
  50. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/sampler/random_under_sampler.py +0 -0
  51. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/sampler/utils.py +0 -0
  52. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/search_task.py +0 -0
  53. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/spinner.py +0 -0
  54. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  55. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/__init__.py +0 -0
  56. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/base_search_key_detector.py +0 -0
  57. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/blocked_time_series.py +0 -0
  58. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/datetime_utils.py +0 -0
  62. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/deduplicate_utils.py +0 -0
  63. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/display_utils.py +0 -0
  64. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/email_utils.py +0 -0
  65. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/fallback_progress_bar.py +0 -0
  66. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/feature_info.py +0 -0
  67. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/features_validator.py +0 -0
  68. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/format.py +0 -0
  69. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/ip_utils.py +0 -0
  70. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/phone_utils.py +0 -0
  71. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/postal_code_utils.py +0 -0
  72. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/progress_bar.py +0 -0
  73. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/sklearn_ext.py +0 -0
  74. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/target_utils.py +0 -0
  75. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/track_info.py +0 -0
  76. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/ts_utils.py +0 -0
  77. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/utils/warning_counter.py +0 -0
  78. {upgini-1.2.62a3818.dev4 → upgini-1.2.65a3818.dev5}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.62a3818.dev4
3
+ Version: 1.2.65a3818.dev5
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -38,6 +38,7 @@ Requires-Dist: python-dateutil>=2.8.0
38
38
  Requires-Dist: python-json-logger>=3.3.0
39
39
  Requires-Dist: requests>=2.8.0
40
40
  Requires-Dist: scikit-learn>=1.3.0
41
+ Requires-Dist: scipy>=1.10.0
41
42
  Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
42
43
  Description-Content-Type: text/markdown
43
44
 
@@ -46,6 +46,7 @@ dependencies = [
46
46
  "python-json-logger>=3.3.0",
47
47
  "requests>=2.8.0",
48
48
  "scikit-learn>=1.3.0",
49
+ "scipy>=1.10.0",
49
50
  "python-bidi==0.4.2",
50
51
  "xhtml2pdf>=0.2.11,<0.3.0",
51
52
  "jarowinkler>=2.0.0",
@@ -0,0 +1 @@
1
+ __version__ = "1.2.65a3818.dev5"
@@ -1,15 +1,34 @@
1
1
  import pandas as pd
2
- from typing import Dict, Optional
2
+ from typing import Dict, Optional, Union
3
3
 
4
4
  from upgini.autofe.operator import ParametrizedOperator
5
5
  from upgini.autofe.timeseries.base import TimeSeriesBase
6
6
  from upgini.autofe.timeseries.lag import Lag
7
7
 
8
8
 
9
- class Delta(TimeSeriesBase, ParametrizedOperator):
9
class DeltaBase(TimeSeriesBase):
    """Common fields and helpers shared by the delta time-series operators.

    Stores the step used for differencing and provides the routine that
    subtracts a lagged copy of the last column from that column.
    """

    # Forwarded to ``Lag`` as ``lag_size`` when the delta is computed.
    delta_size: int
    # Forwarded to ``Lag`` as ``lag_unit`` when the delta is computed.
    delta_unit: str = "D"

    def get_params(self) -> Dict[str, Optional[str]]:
        """Return the parent's parameters extended with the delta settings."""
        params = super().get_params()
        params.update(delta_size=self.delta_size, delta_unit=self.delta_unit)
        return params

    def _calculate_delta(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
        """Replace the last column of ``x`` with its difference from a lagged copy.

        Args:
            x: A Series, or a DataFrame whose last column holds the values.

        Returns:
            A Series when a Series was passed in, otherwise the DataFrame with
            its last column overwritten by the difference.
        """
        shifter = Lag(lag_size=self.delta_size, lag_unit=self.delta_unit)
        was_series = isinstance(x, pd.Series)
        # Work on a DataFrame in both cases so the last column can be
        # addressed positionally.
        frame = pd.DataFrame(x)
        difference = frame.iloc[:, -1] - shifter._aggregate(frame.iloc[:, -1])
        frame.iloc[:, -1] = difference
        if was_series:
            return frame.iloc[:, -1]
        return frame
29
+
30
+
31
+ class Delta(DeltaBase, ParametrizedOperator):
13
32
  def to_formula(self) -> str:
14
33
  base_formula = f"delta_{self.delta_size}{self.delta_unit}"
15
34
  return self._add_offset_to_formula(base_formula)
@@ -47,28 +66,11 @@ class Delta(TimeSeriesBase, ParametrizedOperator):
47
66
 
48
67
  return cls(**params)
49
68
 
50
- def get_params(self) -> Dict[str, Optional[str]]:
51
- res = super().get_params()
52
- res.update(
53
- {
54
- "delta_size": self.delta_size,
55
- "delta_unit": self.delta_unit,
56
- "offset_size": self.offset_size,
57
- "offset_unit": self.offset_unit,
58
- }
59
- )
60
- return res
61
-
62
69
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
63
- lag0 = Lag(lag_size=0, lag_unit=self.delta_unit)
64
- lag = Lag(lag_size=self.delta_size, lag_unit=self.delta_unit)
65
- return lag0._aggregate(ts) - lag._aggregate(ts)
70
+ return ts.apply(self._calculate_delta).iloc[:, [-1]]
66
71
 
67
72
 
68
- class Delta2(TimeSeriesBase, ParametrizedOperator):
69
- delta_size: int
70
- delta_unit: str = "D"
71
-
73
+ class Delta2(DeltaBase, ParametrizedOperator):
72
74
  def to_formula(self) -> str:
73
75
  base_formula = f"delta2_{self.delta_size}{self.delta_unit}"
74
76
  return self._add_offset_to_formula(base_formula)
@@ -106,20 +108,12 @@ class Delta2(TimeSeriesBase, ParametrizedOperator):
106
108
 
107
109
  return cls(**params)
108
110
 
109
- def get_params(self) -> Dict[str, Optional[str]]:
110
- res = super().get_params()
111
- res.update(
112
- {
113
- "delta_size": self.delta_size,
114
- "delta_unit": self.delta_unit,
115
- }
116
- )
117
- return res
118
-
119
111
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
112
+ return ts.apply(self._calculate_delta2).iloc[:, [-1]]
113
+
114
+ def _calculate_delta2(self, x):
120
115
  # Calculate first delta
121
- delta1 = Delta(delta_size=self.delta_size, delta_unit=self.delta_unit)
122
- first_delta = delta1._aggregate(ts)
116
+ first_delta = self._calculate_delta(x)
123
117
 
124
118
  # Calculate delta of delta (second derivative)
125
- return delta1._aggregate(first_delta)
119
+ return self._calculate_delta(first_delta)
@@ -3632,7 +3632,7 @@ if response.status_code == 200:
3632
3632
  )
3633
3633
  do_sorting = False
3634
3634
  else:
3635
- columns_to_hash = list(search_keys.keys()) + renamed_id_columns + [target_name]
3635
+ columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
3636
3636
  columns_to_hash = sort_columns(
3637
3637
  df[columns_to_hash],
3638
3638
  target_name,
@@ -3,7 +3,6 @@ from collections import namedtuple
3
3
 
4
4
  import numpy as np
5
5
  import numpy.ma as ma
6
- import scipy
7
6
  from joblib import Parallel, delayed
8
7
  from numpy import ndarray
9
8
  from psutil import cpu_count
@@ -116,6 +115,22 @@ def spearmanr(
116
115
  if nan_policy == "omit":
117
116
  x = mask_fn(x)
118
117
 
118
# Convert a t-statistic into a p-value using Student's t distribution.
# - dof: degrees of freedom
# - t_stat: t-statistic
# - alternative: 'two-sided', 'greater', 'less'
# The parameter order is (dof, t_stat, alternative) because the helper is
# called positionally as compute_t_pvalue(dof, t, alternative); declaring
# t_stat first would silently swap the statistic and the degrees of freedom.
def compute_t_pvalue(dof, t_stat, alternative='two-sided'):
    """Return ``(t_stat, p_value)`` for a t-test with ``dof`` degrees of freedom.

    Args:
        dof: Degrees of freedom of the t distribution.
        t_stat: Observed t-statistic (scalar or array).
        alternative: ``'two-sided'``, ``'greater'`` or ``'less'``.

    Returns:
        The unchanged ``t_stat`` together with the matching p-value.

    Raises:
        ValueError: If ``alternative`` is not one of the supported values.
    """
    # Imported locally and aliased so the distribution object is not confused
    # with a t-statistic that callers commonly name ``t``.
    from scipy.stats import t as t_dist

    if alternative == "two-sided":
        # Both tails: twice the upper-tail mass beyond |t|.
        prob = 2 * t_dist.sf(abs(t_stat), dof)
    elif alternative == "greater":
        prob = t_dist.sf(t_stat, dof)
    elif alternative == "less":
        prob = t_dist.cdf(t_stat, dof)
    else:
        raise ValueError(f"Unknown alternative: {alternative}")
    return t_stat, prob
133
+
119
134
  def _spearmanr_2cols(x):
120
135
  # Mask the same observations for all variables, and then drop those
121
136
  # observations (can't leave them masked, rankdata is weird).
@@ -142,7 +157,7 @@ def spearmanr(
142
157
  # errors before taking the square root
143
158
  t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
144
159
 
145
- t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)
160
+ t, prob = compute_t_pvalue(dof, t, alternative)
146
161
 
147
162
  # For backwards compatibility, return scalars when comparing 2 columns
148
163
  if rs.shape == (2, 2):
@@ -28,12 +28,13 @@ def sort_columns(
28
28
  logger = logging.getLogger(__name__)
29
29
  logger.setLevel(logging.FATAL)
30
30
  df = df.copy() # avoid side effects
31
+ search_keys = {k: v for k, v in search_keys.items() if v != SearchKey.CUSTOM_KEY}
31
32
 
32
33
  # Check multiple search keys
33
34
  search_key_values = list(search_keys.values())
34
35
  has_duplicate_search_keys = len(search_key_values) != len(set(search_key_values))
35
36
  if has_duplicate_search_keys:
36
- logging.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")
37
+ logger.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")
37
38
 
38
39
  sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
39
40
  sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]
@@ -68,8 +69,9 @@ def get_sort_columns_dict(
68
69
  if len(string_features) > 0:
69
70
  if len(df) > len(df.drop(columns=string_features).drop_duplicates()) or sort_all_columns:
70
71
  # factorize string features
72
+ df = df.copy()
71
73
  for c in string_features:
72
- df.loc[:, c] = pd.Series(df[c].factorize(sort=True)[0], index=df.index, dtype="int")
74
+ df = df.assign(**{c: pd.factorize(df[c], sort=True)[0].astype(int)})
73
75
  columns_for_sort.extend(string_features)
74
76
 
75
77
  if len(columns_for_sort) == 0:
@@ -1 +0,0 @@
1
- __version__ = "1.2.62a3818.dev4"