upgini 1.2.62a3818.dev4__tar.gz → 1.2.65__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in the public registry.

Potentially problematic release.


This version of upgini might be problematic.

Files changed (78)
  1. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/PKG-INFO +2 -1
  2. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/pyproject.toml +1 -0
  3. upgini-1.2.65/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/feature.py +2 -2
  5. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/operator.py +3 -0
  6. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/timeseries/cross.py +11 -2
  7. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/timeseries/delta.py +28 -34
  8. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/timeseries/trend.py +5 -2
  9. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/features_enricher.py +1 -1
  10. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/mstats.py +17 -2
  11. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/sort.py +4 -2
  12. upgini-1.2.62a3818.dev4/src/upgini/__about__.py +0 -1
  13. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/.gitignore +0 -0
  14. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/LICENSE +0 -0
  15. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/README.md +0 -0
  16. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/__init__.py +0 -0
  17. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/ads.py +0 -0
  18. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/ads_management/__init__.py +0 -0
  19. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/ads_management/ads_manager.py +0 -0
  20. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/__init__.py +0 -0
  21. /upgini-1.2.62a3818.dev4/src/upgini/autofe/all_operands.py → /upgini-1.2.65/src/upgini/autofe/all_operators.py +0 -0
  22. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/binary.py +0 -0
  23. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/date.py +0 -0
  24. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/groupby.py +0 -0
  25. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/timeseries/__init__.py +0 -0
  26. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/timeseries/base.py +0 -0
  27. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/timeseries/lag.py +0 -0
  28. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/timeseries/roll.py +0 -0
  29. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/timeseries/volatility.py +0 -0
  30. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/unary.py +0 -0
  31. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/vector.py +0 -0
  32. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/data_source/__init__.py +0 -0
  33. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/data_source/data_source_publisher.py +0 -0
  34. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/dataset.py +0 -0
  35. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/errors.py +0 -0
  36. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/http.py +0 -0
  37. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/lazy_import.py +0 -0
  38. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/mdc/__init__.py +0 -0
  39. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/mdc/context.py +0 -0
  40. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/metadata.py +0 -0
  41. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/metrics.py +0 -0
  42. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/normalizer/__init__.py +0 -0
  43. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/normalizer/normalize_utils.py +0 -0
  44. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/resource_bundle/__init__.py +0 -0
  45. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/resource_bundle/exceptions.py +0 -0
  46. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/resource_bundle/strings.properties +0 -0
  47. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  48. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/sampler/__init__.py +0 -0
  49. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/sampler/base.py +0 -0
  50. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/sampler/random_under_sampler.py +0 -0
  51. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/sampler/utils.py +0 -0
  52. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/search_task.py +0 -0
  53. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/spinner.py +0 -0
  54. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  55. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/__init__.py +0 -0
  56. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/base_search_key_detector.py +0 -0
  57. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/blocked_time_series.py +0 -0
  58. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/datetime_utils.py +0 -0
  62. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/deduplicate_utils.py +0 -0
  63. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/display_utils.py +0 -0
  64. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/email_utils.py +0 -0
  65. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/fallback_progress_bar.py +0 -0
  66. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/feature_info.py +0 -0
  67. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/features_validator.py +0 -0
  68. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/format.py +0 -0
  69. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/ip_utils.py +0 -0
  70. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/phone_utils.py +0 -0
  71. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/postal_code_utils.py +0 -0
  72. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/progress_bar.py +0 -0
  73. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/sklearn_ext.py +0 -0
  74. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/target_utils.py +0 -0
  75. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/track_info.py +0 -0
  76. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/ts_utils.py +0 -0
  77. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/warning_counter.py +0 -0
  78. {upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/version_validator.py +0 -0

{upgini-1.2.62a3818.dev4 → upgini-1.2.65}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.62a3818.dev4
+Version: 1.2.65
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -38,6 +38,7 @@ Requires-Dist: python-dateutil>=2.8.0
 Requires-Dist: python-json-logger>=3.3.0
 Requires-Dist: requests>=2.8.0
 Requires-Dist: scikit-learn>=1.3.0
+Requires-Dist: scipy>=1.10.0
 Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
 Description-Content-Type: text/markdown
 

{upgini-1.2.62a3818.dev4 → upgini-1.2.65}/pyproject.toml
@@ -46,6 +46,7 @@ dependencies = [
     "python-json-logger>=3.3.0",
     "requests>=2.8.0",
     "scikit-learn>=1.3.0",
+    "scipy>=1.10.0",
     "python-bidi==0.4.2",
     "xhtml2pdf>=0.2.11,<0.3.0",
     "jarowinkler>=2.0.0",

upgini-1.2.65/src/upgini/__about__.py (new file)
@@ -0,0 +1 @@
+__version__ = "1.2.65"

{upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/feature.py
@@ -6,7 +6,7 @@ import numpy as np
 import pandas as pd
 from pandas._typing import DtypeObj
 
-from upgini.autofe.all_operands import find_op
+from upgini.autofe.all_operators import find_op
 from upgini.autofe.operator import Operator, PandasOperator
 
 
@@ -112,7 +112,7 @@ class Feature:
 
     def get_hash(self) -> str:
         return hashlib.sha256(
-            "_".join([self.op.to_formula()] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
+            "_".join([self.op.get_hash_component()] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
         ).hexdigest()[:8]
 
     def set_alias(self, alias: str) -> "Feature":

{upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/operator.py
@@ -76,6 +76,9 @@ class Operator(BaseModel, metaclass=OperatorRegistry):
     def to_formula(self) -> str:
         return self.name
 
+    def get_hash_component(self) -> str:
+        return self.to_formula()
+
 
 class ParametrizedOperator(Operator, abc.ABC):
 

{upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/timeseries/cross.py
@@ -8,7 +8,7 @@ try:
 except ImportError:
     from pydantic import validator  # V1
 
-from upgini.autofe.all_operands import find_op
+from upgini.autofe.all_operators import find_op
 from upgini.autofe.operator import PandasOperator, ParametrizedOperator
 from upgini.autofe.timeseries.base import TimeSeriesBase
 
@@ -95,6 +95,15 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
         )
         return res
 
+    def get_hash_component(self) -> str:
+        inner_components = [
+            self.to_formula(),
+            "_".join(str(i) for i in self.descriptor_indices),
+            "_".join(self.left_descriptor),
+            "_".join(self.right_descriptor),
+        ]
+        return "_".join(inner_components)
+
     def _get_interaction_op_name(self) -> str:
         return self.interaction_op.alias or self.interaction_op.to_formula()
 
@@ -113,7 +122,7 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
         return res
 
     def _get_mask(self, data: List[pd.Series], descriptor: List[str]) -> pd.Series:
-        mask = np.logical_and.reduce([data[i] == v for i, v in zip(self.descriptor_indices, descriptor)])
+        mask = np.logical_and.reduce([data[i].astype(str) == v for i, v in zip(self.descriptor_indices, descriptor)])
         return mask
 
     def _extract_series(self, data: List[pd.Series], mask: pd.Series) -> pd.Series:
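
The three changes above work together: Feature.get_hash now hashes Operator.get_hash_component() instead of the bare formula, and CrossSeriesInteraction overrides that hook so interactions that share a formula but differ in their descriptors no longer collide. A minimal sketch of the pattern, with hypothetical Op/CrossOp classes and a feature_hash helper standing in for the real Operator/CrossSeriesInteraction/Feature code:

import hashlib

class Op:
    # Default mirrors the new Operator.get_hash_component hook:
    # the hash component is just the operator's formula.
    name = "op"

    def to_formula(self) -> str:
        return self.name

    def get_hash_component(self) -> str:
        return self.to_formula()

class CrossOp(Op):
    # Hypothetical stand-in for CrossSeriesInteraction: its identity also
    # depends on which descriptors the interaction is built over.
    name = "cross"

    def __init__(self, left, right):
        self.left_descriptor = left
        self.right_descriptor = right

    def get_hash_component(self) -> str:
        return "_".join([self.to_formula(), "_".join(self.left_descriptor), "_".join(self.right_descriptor)])

def feature_hash(op, child_names) -> str:
    # Same shape as Feature.get_hash: sha256 over the hash component plus child display names.
    return hashlib.sha256("_".join([op.get_hash_component()] + child_names).encode("utf-8")).hexdigest()[:8]

# Two interactions over the same child column now hash differently:
print(feature_hash(CrossOp(["store_a"], ["store_b"]), ["sales"]))
print(feature_hash(CrossOp(["store_a"], ["store_c"]), ["sales"]))
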

{upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/timeseries/delta.py
@@ -1,15 +1,34 @@
 import pandas as pd
-from typing import Dict, Optional
+from typing import Dict, Optional, Union
 
 from upgini.autofe.operator import ParametrizedOperator
 from upgini.autofe.timeseries.base import TimeSeriesBase
 from upgini.autofe.timeseries.lag import Lag
 
 
-class Delta(TimeSeriesBase, ParametrizedOperator):
+class DeltaBase(TimeSeriesBase):
     delta_size: int
     delta_unit: str = "D"
 
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "delta_size": self.delta_size,
+                "delta_unit": self.delta_unit,
+            }
+        )
+        return res
+
+    def _calculate_delta(self, x: Union[pd.DataFrame, pd.Series]) -> Union[pd.DataFrame, pd.Series]:
+        return_series = isinstance(x, pd.Series)
+        x = pd.DataFrame(x)
+        lag = Lag(lag_size=self.delta_size, lag_unit=self.delta_unit)
+        x.iloc[:, -1] = x.iloc[:, -1] - lag._aggregate(x.iloc[:, -1])
+        return x.iloc[:, -1] if return_series else x
+
+
+class Delta(DeltaBase, ParametrizedOperator):
     def to_formula(self) -> str:
         base_formula = f"delta_{self.delta_size}{self.delta_unit}"
         return self._add_offset_to_formula(base_formula)
@@ -47,28 +66,11 @@ class Delta(TimeSeriesBase, ParametrizedOperator):
 
         return cls(**params)
 
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "delta_size": self.delta_size,
-                "delta_unit": self.delta_unit,
-                "offset_size": self.offset_size,
-                "offset_unit": self.offset_unit,
-            }
-        )
-        return res
-
     def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
-        lag0 = Lag(lag_size=0, lag_unit=self.delta_unit)
-        lag = Lag(lag_size=self.delta_size, lag_unit=self.delta_unit)
-        return lag0._aggregate(ts) - lag._aggregate(ts)
+        return ts.apply(self._calculate_delta).iloc[:, [-1]]
 
 
-class Delta2(TimeSeriesBase, ParametrizedOperator):
-    delta_size: int
-    delta_unit: str = "D"
-
+class Delta2(DeltaBase, ParametrizedOperator):
     def to_formula(self) -> str:
         base_formula = f"delta2_{self.delta_size}{self.delta_unit}"
         return self._add_offset_to_formula(base_formula)
@@ -106,20 +108,12 @@ class Delta2(TimeSeriesBase, ParametrizedOperator):
 
         return cls(**params)
 
-    def get_params(self) -> Dict[str, Optional[str]]:
-        res = super().get_params()
-        res.update(
-            {
-                "delta_size": self.delta_size,
-                "delta_unit": self.delta_unit,
-            }
-        )
-        return res
-
     def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
+        return ts.apply(self._calculate_delta2).iloc[:, [-1]]
+
+    def _calculate_delta2(self, x):
         # Calculate first delta
-        delta1 = Delta(delta_size=self.delta_size, delta_unit=self.delta_unit)
-        first_delta = delta1._aggregate(ts)
+        first_delta = self._calculate_delta(x)
 
         # Calculate delta of delta (second derivative)
-        return delta1._aggregate(first_delta)
+        return self._calculate_delta(first_delta)
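
The delta.py refactor factors the shared delta_size/delta_unit parameters and the lag-difference arithmetic into a DeltaBase mixin: Delta applies one lagged difference, and Delta2 applies the same difference twice to get a discrete second derivative. A standalone sketch of that arithmetic on a toy daily series, using pandas.Series.shift in place of the library's Lag operator (an assumption for illustration only):

import pandas as pd

def delta(s: pd.Series, size: int = 1) -> pd.Series:
    # First difference: x[t] - x[t - size]
    return s - s.shift(size)

def delta2(s: pd.Series, size: int = 1) -> pd.Series:
    # Second difference: the delta of the delta
    return delta(delta(s, size), size)

idx = pd.date_range("2024-01-01", periods=5, freq="D")
s = pd.Series([1.0, 2.0, 4.0, 7.0, 11.0], index=idx)
print(delta(s))   # NaN, 1, 2, 3, 4
print(delta2(s))  # NaN, NaN, 1, 1, 1
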

{upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/autofe/timeseries/trend.py
@@ -56,6 +56,9 @@ class TrendCoefficient(TimeSeriesBase):
             x.iloc[:, -1].resample(f"{self.step_size}{self.step_unit}").fillna(method="ffill").fillna(method="bfill")
         )
         idx = np.arange(len(resampled))
-        coeffs = np.polyfit(idx, resampled, 1)
-        x.iloc[:, -1] = coeffs[0]
+        try:
+            coeffs = np.polyfit(idx, resampled, 1)
+            x.iloc[:, -1] = coeffs[0]
+        except np.linalg.LinAlgError:
+            x.iloc[:, -1] = 0
         return x.iloc[:, -1] if return_series else x
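
TrendCoefficient takes the slope of a degree-1 least-squares fit over the resampled series; the change above falls back to a slope of 0 when np.polyfit raises LinAlgError on degenerate input. A minimal sketch of the same guard, with a hypothetical trend_slope helper operating on a plain numeric sequence instead of the resampled frame:

import numpy as np

def trend_slope(values) -> float:
    # Slope of a degree-1 least-squares fit; 0.0 if the fit cannot be computed.
    idx = np.arange(len(values))
    try:
        coeffs = np.polyfit(idx, values, 1)
        return float(coeffs[0])
    except np.linalg.LinAlgError:
        return 0.0

print(trend_slope([1.0, 2.0, 3.0, 4.0]))  # ~1.0
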

{upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/features_enricher.py
@@ -3632,7 +3632,7 @@ if response.status_code == 200:
             )
             do_sorting = False
         else:
-            columns_to_hash = list(search_keys.keys()) + renamed_id_columns + [target_name]
+            columns_to_hash = list(set(list(search_keys.keys()) + renamed_id_columns + [target_name]))
             columns_to_hash = sort_columns(
                 df[columns_to_hash],
                 target_name,
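
The features_enricher.py change wraps the column list in list(set(...)) so a column that appears both as a search key and as an id column (or as the target) is hashed only once. A tiny illustration with hypothetical column names; set() does not preserve order, which is acceptable here because the result is immediately re-ordered by the sort_columns call that follows:

cols = ["date", "user_id", "user_id", "target"]  # hypothetical overlap
print(list(set(cols)))  # each column kept once; order not preserved
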

{upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/mstats.py
@@ -3,7 +3,6 @@ from collections import namedtuple
 
 import numpy as np
 import numpy.ma as ma
-import scipy
 from joblib import Parallel, delayed
 from numpy import ndarray
 from psutil import cpu_count
@@ -116,6 +115,22 @@ def spearmanr(
     if nan_policy == "omit":
         x = mask_fn(x)
 
+    # - dof: degrees of freedom
+    # - t_stat: t-statistic
+    # - alternative: 'two-sided', 'greater', 'less'
+    def compute_t_pvalue(t_stat, dof, alternative='two-sided'):
+        from scipy.stats import t
+
+        if alternative == "two-sided":
+            prob = 2 * t.sf(abs(t_stat), dof)
+        elif alternative == "greater":
+            prob = t.sf(t_stat, dof)
+        elif alternative == "less":
+            prob = t.cdf(t_stat, dof)
+        else:
+            raise ValueError(f"Unknown alternative: {alternative}")
+        return t_stat, prob
+
     def _spearmanr_2cols(x):
         # Mask the same observations for all variables, and then drop those
         # observations (can't leave them masked, rankdata is weird).
@@ -142,7 +157,7 @@ def spearmanr(
         # errors before taking the square root
         t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
 
-        t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)
+        t, prob = compute_t_pvalue(dof, t, alternative)
 
         # For backwards compatibility, return scalars when comparing 2 columns
         if rs.shape == (2, 2):
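
The new compute_t_pvalue helper replaces the call to the private scipy.stats._mstats_basic._ttest_finish API with an explicit conversion of a t-statistic and degrees of freedom into a p-value via the Student-t distribution. A minimal sketch of the two-sided case for scalar inputs, with a hypothetical two_sided_t_pvalue helper and made-up example numbers:

import numpy as np
from scipy.stats import t

def two_sided_t_pvalue(t_stat: float, dof: float) -> float:
    # P(|T| >= |t_stat|) for T ~ Student-t with `dof` degrees of freedom
    return 2 * t.sf(abs(t_stat), dof)

# Spearman correlation of 0.9 over n=10 observations:
rs, n = 0.9, 10
dof = n - 2
t_stat = rs * np.sqrt(dof / ((rs + 1.0) * (1.0 - rs)))
print(two_sided_t_pvalue(t_stat, dof))  # ~4e-4
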

{upgini-1.2.62a3818.dev4 → upgini-1.2.65}/src/upgini/utils/sort.py
@@ -28,12 +28,13 @@ def sort_columns(
     logger = logging.getLogger(__name__)
     logger.setLevel(logging.FATAL)
     df = df.copy()  # avoid side effects
+    search_keys = {k: v for k, v in search_keys.items() if v != SearchKey.CUSTOM_KEY}
 
     # Check multiple search keys
     search_key_values = list(search_keys.values())
     has_duplicate_search_keys = len(search_key_values) != len(set(search_key_values))
     if has_duplicate_search_keys:
-        logging.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")
+        logger.warning(f"WARNING: Found duplicate SearchKey values in search_keys: {search_keys}")
 
     sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
     sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]
@@ -68,8 +69,9 @@ def get_sort_columns_dict(
     if len(string_features) > 0:
         if len(df) > len(df.drop(columns=string_features).drop_duplicates()) or sort_all_columns:
             # factorize string features
+            df = df.copy()
             for c in string_features:
-                df.loc[:, c] = pd.Series(df[c].factorize(sort=True)[0], index=df.index, dtype="int")
+                df = df.assign(**{c: pd.factorize(df[c], sort=True)[0].astype(int)})
             columns_for_sort.extend(string_features)
 
     if len(columns_for_sort) == 0:
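
The sort.py changes stop mutating the caller's frame (df.copy() plus df.assign) and switch to pd.factorize(..., sort=True), which maps each string value to an integer code ordered by the sorted labels so the factorized columns can serve as sort keys. A small sketch of what that factorization does, on a hypothetical toy frame:

import pandas as pd

df = pd.DataFrame({"city": ["Oslo", "Bergen", "Oslo", "Aalesund"]})

# sort=True assigns codes by sorted label order, so equal strings get equal
# codes and the codes sort the same way the labels would.
codes, labels = pd.factorize(df["city"], sort=True)
df = df.assign(city=codes.astype(int))
print(labels)  # Index(['Aalesund', 'Bergen', 'Oslo'], dtype='object')
print(df)      # city codes: 2, 1, 2, 0
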

upgini-1.2.62a3818.dev4/src/upgini/__about__.py (removed)
@@ -1 +0,0 @@
-__version__ = "1.2.62a3818.dev4"