upgini 1.2.62a3818.dev3__tar.gz → 1.2.63__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. More details are linked from the original diff page.

Files changed (71)
  1. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/PKG-INFO +2 -1
  2. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/pyproject.toml +1 -0
  3. upgini-1.2.63/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/all_operands.py +2 -2
  5. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/binary.py +11 -11
  6. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/date.py +6 -6
  7. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/feature.py +6 -6
  8. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/groupby.py +6 -6
  9. upgini-1.2.62a3818.dev3/src/upgini/autofe/operator.py → upgini-1.2.63/src/upgini/autofe/operand.py +9 -9
  10. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/unary.py +11 -11
  11. upgini-1.2.62a3818.dev3/src/upgini/autofe/timeseries.py → upgini-1.2.63/src/upgini/autofe/vector.py +26 -6
  12. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/features_enricher.py +1 -1
  13. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/mstats.py +17 -2
  14. upgini-1.2.62a3818.dev3/src/upgini/__about__.py +0 -1
  15. upgini-1.2.62a3818.dev3/src/upgini/autofe/vector.py +0 -24
  16. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/.gitignore +0 -0
  17. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/LICENSE +0 -0
  18. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/README.md +0 -0
  19. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/__init__.py +0 -0
  20. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/ads.py +0 -0
  21. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/ads_management/__init__.py +0 -0
  22. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/ads_management/ads_manager.py +0 -0
  23. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/__init__.py +0 -0
  24. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/data_source/__init__.py +0 -0
  25. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/data_source/data_source_publisher.py +0 -0
  26. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/dataset.py +0 -0
  27. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/errors.py +0 -0
  28. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/http.py +0 -0
  29. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/lazy_import.py +0 -0
  30. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/metadata.py +0 -0
  33. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/metrics.py +0 -0
  34. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/normalizer/__init__.py +0 -0
  35. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/normalizer/normalize_utils.py +0 -0
  36. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/resource_bundle/__init__.py +0 -0
  37. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/resource_bundle/strings.properties +0 -0
  39. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  40. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/sampler/__init__.py +0 -0
  41. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/sampler/base.py +0 -0
  42. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/sampler/random_under_sampler.py +0 -0
  43. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/sampler/utils.py +0 -0
  44. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/search_task.py +0 -0
  45. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/spinner.py +0 -0
  46. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  47. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/__init__.py +0 -0
  48. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/base_search_key_detector.py +0 -0
  49. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/blocked_time_series.py +0 -0
  50. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/country_utils.py +0 -0
  51. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/custom_loss_utils.py +0 -0
  52. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/cv_utils.py +0 -0
  53. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/datetime_utils.py +0 -0
  54. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/deduplicate_utils.py +0 -0
  55. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/display_utils.py +0 -0
  56. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/email_utils.py +0 -0
  57. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/fallback_progress_bar.py +0 -0
  58. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/feature_info.py +0 -0
  59. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/features_validator.py +0 -0
  60. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/format.py +0 -0
  61. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/ip_utils.py +0 -0
  62. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/phone_utils.py +0 -0
  63. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/postal_code_utils.py +0 -0
  64. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/progress_bar.py +0 -0
  65. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/sklearn_ext.py +0 -0
  66. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/sort.py +0 -0
  67. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/target_utils.py +0 -0
  68. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/track_info.py +0 -0
  69. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/ts_utils.py +0 -0
  70. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/warning_counter.py +0 -0
  71. {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.62a3818.dev3
3
+ Version: 1.2.63
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -38,6 +38,7 @@ Requires-Dist: python-dateutil>=2.8.0
38
38
  Requires-Dist: python-json-logger>=3.3.0
39
39
  Requires-Dist: requests>=2.8.0
40
40
  Requires-Dist: scikit-learn>=1.3.0
41
+ Requires-Dist: scipy>=1.10.0
41
42
  Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
42
43
  Description-Content-Type: text/markdown
43
44
 
@@ -46,6 +46,7 @@ dependencies = [
46
46
  "python-json-logger>=3.3.0",
47
47
  "requests>=2.8.0",
48
48
  "scikit-learn>=1.3.0",
49
+ "scipy>=1.10.0",
49
50
  "python-bidi==0.4.2",
50
51
  "xhtml2pdf>=0.2.11,<0.3.0",
51
52
  "jarowinkler>=2.0.0",
@@ -0,0 +1 @@
1
+ __version__ = "1.2.63"
@@ -1,4 +1,4 @@
1
- from upgini.autofe.operator import OperatorRegistry
1
+ from upgini.autofe.operand import OperandRegistry
2
2
  from upgini.autofe.unary import * # noqa
3
3
  from upgini.autofe.binary import * # noqa
4
4
  from upgini.autofe.groupby import * # noqa
@@ -7,4 +7,4 @@ from upgini.autofe.vector import * # noqa
7
7
 
8
8
 
9
9
  def find_op(name):
10
- return OperatorRegistry.get_operand(name)
10
+ return OperandRegistry.get_operand(name)
@@ -5,10 +5,10 @@ import numpy as np
5
5
  import pandas as pd
6
6
  from jarowinkler import jarowinkler_similarity
7
7
 
8
- from upgini.autofe.operator import PandasOperator, VectorizableMixin
8
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
9
9
 
10
10
 
11
- class Min(PandasOperator):
11
+ class Min(PandasOperand):
12
12
  name: str = "min"
13
13
  is_binary: bool = True
14
14
  is_symmetrical: bool = True
@@ -18,7 +18,7 @@ class Min(PandasOperator):
18
18
  return np.minimum(left, right)
19
19
 
20
20
 
21
- class Max(PandasOperator):
21
+ class Max(PandasOperand):
22
22
  name: str = "max"
23
23
  is_binary: bool = True
24
24
  is_symmetrical: bool = True
@@ -28,7 +28,7 @@ class Max(PandasOperator):
28
28
  return np.maximum(left, right)
29
29
 
30
30
 
31
- class Add(PandasOperator, VectorizableMixin):
31
+ class Add(PandasOperand, VectorizableMixin):
32
32
  name: str = "+"
33
33
  alias: str = "add"
34
34
  is_binary: bool = True
@@ -47,7 +47,7 @@ class Add(PandasOperator, VectorizableMixin):
47
47
  return d1.add(d2, axis=0)
48
48
 
49
49
 
50
- class Subtract(PandasOperator, VectorizableMixin):
50
+ class Subtract(PandasOperand, VectorizableMixin):
51
51
  name: str = "-"
52
52
  alias: str = "sub"
53
53
  is_binary: bool = True
@@ -66,7 +66,7 @@ class Subtract(PandasOperator, VectorizableMixin):
66
66
  return d1.sub(d2, axis=0)
67
67
 
68
68
 
69
- class Multiply(PandasOperator, VectorizableMixin):
69
+ class Multiply(PandasOperand, VectorizableMixin):
70
70
  name: str = "*"
71
71
  alias: str = "mul"
72
72
  is_binary: bool = True
@@ -85,7 +85,7 @@ class Multiply(PandasOperator, VectorizableMixin):
85
85
  return d1.mul(d2, axis=0)
86
86
 
87
87
 
88
- class Divide(PandasOperator, VectorizableMixin):
88
+ class Divide(PandasOperand, VectorizableMixin):
89
89
  name: str = "/"
90
90
  alias: str = "div"
91
91
  is_binary: bool = True
@@ -104,7 +104,7 @@ class Divide(PandasOperator, VectorizableMixin):
104
104
  return d1.div(d2.replace(0, np.nan), axis=0)
105
105
 
106
106
 
107
- class Combine(PandasOperator):
107
+ class Combine(PandasOperand):
108
108
  name: str = "Combine"
109
109
  is_binary: bool = True
110
110
  has_symmetry_importance: bool = True
@@ -116,7 +116,7 @@ class Combine(PandasOperator):
116
116
  return pd.Series(temp, index=left.index)
117
117
 
118
118
 
119
- class CombineThenFreq(PandasOperator):
119
+ class CombineThenFreq(PandasOperand):
120
120
  name: str = "CombineThenFreq"
121
121
  is_binary: bool = True
122
122
  is_symmetrical: bool = True
@@ -132,7 +132,7 @@ class CombineThenFreq(PandasOperator):
132
132
  self._loc(temp, value_counts)
133
133
 
134
134
 
135
- class Distance(PandasOperator):
135
+ class Distance(PandasOperand):
136
136
  name: str = "dist"
137
137
  is_binary: bool = True
138
138
  output_type: Optional[str] = "float"
@@ -170,7 +170,7 @@ class Sim(Distance):
170
170
  return 1 - super().calculate_binary(left, right)
171
171
 
172
172
 
173
- class StringSim(PandasOperator, abc.ABC):
173
+ class StringSim(PandasOperand, abc.ABC):
174
174
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
175
175
  sims = []
176
176
  for i in left.index:
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas.core.arrays.timedeltas import TimedeltaArray
8
8
  from pydantic import BaseModel, __version__ as pydantic_version
9
9
 
10
- from upgini.autofe.operator import PandasOperator, ParametrizedOperator
10
+ from upgini.autofe.operand import PandasOperand, ParametrizedOperand
11
11
 
12
12
 
13
13
  def get_pydantic_version():
@@ -43,7 +43,7 @@ class DateDiffMixin(BaseModel):
43
43
  raise Exception(f"Unsupported difference unit: {self.diff_unit}")
44
44
 
45
45
 
46
- class DateDiff(PandasOperator, DateDiffMixin):
46
+ class DateDiff(PandasOperand, DateDiffMixin):
47
47
  name: str = "date_diff"
48
48
  alias: Optional[str] = "date_diff_type1"
49
49
  is_binary: bool = True
@@ -78,7 +78,7 @@ class DateDiff(PandasOperator, DateDiffMixin):
78
78
  return x
79
79
 
80
80
 
81
- class DateDiffType2(PandasOperator, DateDiffMixin):
81
+ class DateDiffType2(PandasOperand, DateDiffMixin):
82
82
  name: str = "date_diff_type2"
83
83
  is_binary: bool = True
84
84
  has_symmetry_importance: bool = True
@@ -112,7 +112,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
112
112
  _count_aggregations = ["nunique", "count"]
113
113
 
114
114
 
115
- class DateListDiff(PandasOperator, DateDiffMixin, ParametrizedOperator):
115
+ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
116
116
  is_binary: bool = True
117
117
  has_symmetry_importance: bool = True
118
118
 
@@ -183,7 +183,7 @@ class DateListDiff(PandasOperator, DateDiffMixin, ParametrizedOperator):
183
183
  return method(x) if len(x) > 0 else default
184
184
 
185
185
 
186
- class DateListDiffBounded(DateListDiff, ParametrizedOperator):
186
+ class DateListDiffBounded(DateListDiff, ParametrizedOperand):
187
187
  lower_bound: Optional[int] = None
188
188
  upper_bound: Optional[int] = None
189
189
 
@@ -217,7 +217,7 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
217
217
  return super()._agg(x)
218
218
 
219
219
 
220
- class DatePercentileBase(PandasOperator, abc.ABC):
220
+ class DatePercentileBase(PandasOperand, abc.ABC):
221
221
  is_binary: bool = True
222
222
  output_type: Optional[str] = "float"
223
223
 
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas._typing import DtypeObj
8
8
 
9
9
  from upgini.autofe.all_operands import find_op
10
- from upgini.autofe.operator import Operator, PandasOperator
10
+ from upgini.autofe.operand import Operand, PandasOperand
11
11
 
12
12
 
13
13
  class Column:
@@ -65,7 +65,7 @@ class Column:
65
65
  class Feature:
66
66
  def __init__(
67
67
  self,
68
- op: Operator,
68
+ op: Operand,
69
69
  children: List[Union[Column, "Feature"]],
70
70
  data: Optional[pd.DataFrame] = None,
71
71
  display_index: Optional[str] = None,
@@ -188,7 +188,7 @@ class Feature:
188
188
  return self.children[0].infer_type(data)
189
189
 
190
190
  def calculate(self, data: pd.DataFrame, is_root=False) -> Union[pd.Series, pd.DataFrame]:
191
- if isinstance(self.op, PandasOperator):
191
+ if isinstance(self.op, PandasOperand):
192
192
  if self.op.is_vector:
193
193
  ds = [child.calculate(data) for child in self.children]
194
194
  new_data = self.op.calculate(data=ds)
@@ -324,7 +324,7 @@ class Feature:
324
324
 
325
325
  class FeatureGroup:
326
326
  def __init__(
327
- self, op: Operator, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
327
+ self, op: Operand, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
328
328
  ):
329
329
  self.op = op
330
330
  self.main_column_node = main_column
@@ -345,7 +345,7 @@ class FeatureGroup:
345
345
  return names
346
346
 
347
347
  def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
348
- if isinstance(self.op, PandasOperator):
348
+ if isinstance(self.op, PandasOperand):
349
349
  main_column = None if self.main_column_node is None else self.main_column_node.get_display_name()
350
350
  lower_order_children = []
351
351
  if self.main_column_node is not None:
@@ -378,7 +378,7 @@ class FeatureGroup:
378
378
  def make_groups(candidates: List[Feature]) -> List[Union[Feature, "FeatureGroup"]]:
379
379
  grouped_features = []
380
380
 
381
- def groupby_func(f: Feature) -> Tuple[Operator, Union[Column, Feature]]:
381
+ def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
382
382
  return (f.op, f.children[0 if not f.op.is_vectorizable else f.op.group_index])
383
383
 
384
384
  for op_child, features in itertools.groupby(candidates, groupby_func):
@@ -2,13 +2,13 @@ from typing import Optional
2
2
 
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operator import PandasOperator, ParametrizedOperator, VectorizableMixin
5
+ from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
6
6
 
7
7
 
8
8
  class GroupByThenAgg(
9
- PandasOperator,
9
+ PandasOperand,
10
10
  VectorizableMixin,
11
- ParametrizedOperator,
11
+ ParametrizedOperand,
12
12
  ):
13
13
  agg: Optional[str]
14
14
  is_vectorizable: bool = True
@@ -39,7 +39,7 @@ class GroupByThenAgg(
39
39
  return temp.merge(d2, how="right", on=[group_column])[value_columns]
40
40
 
41
41
 
42
- class GroupByThenRank(PandasOperator, VectorizableMixin):
42
+ class GroupByThenRank(PandasOperand, VectorizableMixin):
43
43
  name: str = "GroupByThenRank"
44
44
  is_vectorizable: bool = True
45
45
  is_grouping: bool = True
@@ -58,7 +58,7 @@ class GroupByThenRank(PandasOperator, VectorizableMixin):
58
58
  return temp.merge(d2.reset_index(), how="right", on=["index"])[value_columns]
59
59
 
60
60
 
61
- class GroupByThenNUnique(PandasOperator, VectorizableMixin):
61
+ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
62
62
  name: str = "GroupByThenNUnique"
63
63
  is_vectorizable: bool = True
64
64
  is_grouping: bool = True
@@ -78,7 +78,7 @@ class GroupByThenNUnique(PandasOperator, VectorizableMixin):
78
78
  return nunique.merge(d2, how="right", on=[group_column])[value_columns]
79
79
 
80
80
 
81
- class GroupByThenFreq(PandasOperator):
81
+ class GroupByThenFreq(PandasOperand):
82
82
  name: str = "GroupByThenFreq"
83
83
  is_grouping: bool = True
84
84
  output_type: Optional[str] = "float"
@@ -6,7 +6,7 @@ import pandas as pd
6
6
  from pydantic import BaseModel
7
7
 
8
8
 
9
- class OperatorRegistry(type(BaseModel)):
9
+ class OperandRegistry(type(BaseModel)):
10
10
  _registry = {}
11
11
  _parametrized_registry = []
12
12
 
@@ -20,9 +20,9 @@ class OperatorRegistry(type(BaseModel)):
20
20
  base_names.update(b.__name__ for b in base.__bases__)
21
21
  base_classes.extend(base.__bases__)
22
22
 
23
- if "Operator" in base_names:
23
+ if "Operand" in base_names:
24
24
  # Track parametrized operands separately
25
- if "ParametrizedOperator" in base_names:
25
+ if "ParametrizedOperand" in base_names:
26
26
  cls._parametrized_registry.append(new_class)
27
27
  else:
28
28
  try:
@@ -33,7 +33,7 @@ class OperatorRegistry(type(BaseModel)):
33
33
  return new_class
34
34
 
35
35
  @classmethod
36
- def get_operand(cls, name: str) -> Optional["Operator"]:
36
+ def get_operand(cls, name: str) -> Optional["Operand"]:
37
37
  # First try to resolve as a parametrized operand formula
38
38
  for operand_cls in cls._parametrized_registry:
39
39
  resolved = operand_cls.from_formula(name)
@@ -46,7 +46,7 @@ class OperatorRegistry(type(BaseModel)):
46
46
  return None
47
47
 
48
48
 
49
- class Operator(BaseModel, metaclass=OperatorRegistry):
49
+ class Operand(BaseModel, metaclass=OperandRegistry):
50
50
  name: Optional[str] = None
51
51
  alias: Optional[str] = None
52
52
  is_unary: bool = False
@@ -75,7 +75,7 @@ class Operator(BaseModel, metaclass=OperatorRegistry):
75
75
  return self.name
76
76
 
77
77
 
78
- class ParametrizedOperator(Operator, abc.ABC):
78
+ class ParametrizedOperand(Operand, abc.ABC):
79
79
 
80
80
  @abc.abstractmethod
81
81
  def to_formula(self) -> str:
@@ -83,14 +83,14 @@ class ParametrizedOperator(Operator, abc.ABC):
83
83
 
84
84
  @classmethod
85
85
  @abc.abstractmethod
86
- def from_formula(cls, formula: str) -> Optional["Operator"]:
86
+ def from_formula(cls, formula: str) -> Optional["Operand"]:
87
87
  pass
88
88
 
89
89
 
90
90
  MAIN_COLUMN = "main_column"
91
91
 
92
92
 
93
- class PandasOperator(Operator, abc.ABC):
93
+ class PandasOperand(Operand, abc.ABC):
94
94
  def calculate(self, **kwargs) -> pd.Series:
95
95
  if self.is_unary:
96
96
  return self.calculate_unary(kwargs["data"])
@@ -131,7 +131,7 @@ class PandasOperator(Operator, abc.ABC):
131
131
  return value
132
132
 
133
133
 
134
- class VectorizableMixin(Operator):
134
+ class VectorizableMixin(Operand):
135
135
  group_index: int = 1
136
136
 
137
137
  def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:
@@ -2,10 +2,10 @@ from typing import Dict, Optional
2
2
  import numpy as np
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operator import PandasOperator, VectorizableMixin
5
+ from upgini.autofe.operand import PandasOperand, VectorizableMixin
6
6
 
7
7
 
8
- class Abs(PandasOperator, VectorizableMixin):
8
+ class Abs(PandasOperand, VectorizableMixin):
9
9
  name: str = "abs"
10
10
  is_unary: bool = True
11
11
  is_vectorizable: bool = True
@@ -20,7 +20,7 @@ class Abs(PandasOperator, VectorizableMixin):
20
20
  # return data.abs()
21
21
 
22
22
 
23
- class Log(PandasOperator, VectorizableMixin):
23
+ class Log(PandasOperand, VectorizableMixin):
24
24
  name: str = "log"
25
25
  is_unary: bool = True
26
26
  is_vectorizable: bool = True
@@ -34,7 +34,7 @@ class Log(PandasOperator, VectorizableMixin):
34
34
  return self._round_value(np.log(data.replace(0, np.nan).abs()), 10)
35
35
 
36
36
 
37
- class Sqrt(PandasOperator, VectorizableMixin):
37
+ class Sqrt(PandasOperand, VectorizableMixin):
38
38
  name: str = "sqrt"
39
39
  is_unary: bool = True
40
40
  is_vectorizable: bool = True
@@ -48,7 +48,7 @@ class Sqrt(PandasOperator, VectorizableMixin):
48
48
  return self._round_value(np.sqrt(data.abs()))
49
49
 
50
50
 
51
- class Square(PandasOperator, VectorizableMixin):
51
+ class Square(PandasOperand, VectorizableMixin):
52
52
  name: str = "square"
53
53
  is_unary: bool = True
54
54
  is_vectorizable: bool = True
@@ -61,7 +61,7 @@ class Square(PandasOperator, VectorizableMixin):
61
61
  return np.square(data)
62
62
 
63
63
 
64
- class Sigmoid(PandasOperator, VectorizableMixin):
64
+ class Sigmoid(PandasOperand, VectorizableMixin):
65
65
  name: str = "sigmoid"
66
66
  is_unary: bool = True
67
67
  is_vectorizable: bool = True
@@ -75,7 +75,7 @@ class Sigmoid(PandasOperator, VectorizableMixin):
75
75
  return self._round_value(1 / (1 + np.exp(-data)))
76
76
 
77
77
 
78
- class Floor(PandasOperator, VectorizableMixin):
78
+ class Floor(PandasOperand, VectorizableMixin):
79
79
  name: str = "floor"
80
80
  is_unary: bool = True
81
81
  is_vectorizable: bool = True
@@ -90,7 +90,7 @@ class Floor(PandasOperator, VectorizableMixin):
90
90
  return np.floor(data)
91
91
 
92
92
 
93
- class Residual(PandasOperator, VectorizableMixin):
93
+ class Residual(PandasOperand, VectorizableMixin):
94
94
  name: str = "residual"
95
95
  is_unary: bool = True
96
96
  is_vectorizable: bool = True
@@ -104,7 +104,7 @@ class Residual(PandasOperator, VectorizableMixin):
104
104
  return data - np.floor(data)
105
105
 
106
106
 
107
- class Freq(PandasOperator):
107
+ class Freq(PandasOperand):
108
108
  name: str = "freq"
109
109
  is_unary: bool = True
110
110
  output_type: Optional[str] = "float"
@@ -116,7 +116,7 @@ class Freq(PandasOperator):
116
116
  return self._loc(data, value_counts)
117
117
 
118
118
 
119
- class Norm(PandasOperator):
119
+ class Norm(PandasOperand):
120
120
  name: str = "norm"
121
121
  is_unary: bool = True
122
122
  output_type: Optional[str] = "float"
@@ -148,7 +148,7 @@ class Norm(PandasOperator):
148
148
  return res
149
149
 
150
150
 
151
- class Embeddings(PandasOperator):
151
+ class Embeddings(PandasOperand):
152
152
  name: str = "emb"
153
153
  is_unary: bool = True
154
154
  input_type: Optional[str] = "string"
@@ -2,15 +2,35 @@ import abc
2
2
  from typing import Dict, List, Optional
3
3
 
4
4
  import pandas as pd
5
- from upgini.autofe.operator import PandasOperator, ParametrizedOperator
6
5
 
7
6
  try:
8
7
  from pydantic import field_validator as validator # V2
9
8
  except ImportError:
10
9
  from pydantic import validator # V1
11
10
 
11
+ from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
12
12
 
13
- class TimeSeriesBase(PandasOperator, abc.ABC):
13
+
14
+ class Mean(PandasOperand, VectorizableMixin):
15
+ name: str = "mean"
16
+ output_type: Optional[str] = "float"
17
+ is_vector: bool = True
18
+ group_index: int = 0
19
+
20
+ def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
21
+ return pd.DataFrame(data).T.fillna(0).mean(axis=1)
22
+
23
+
24
+ class Sum(PandasOperand, VectorizableMixin):
25
+ name: str = "sum"
26
+ is_vector: bool = True
27
+ group_index: int = 0
28
+
29
+ def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
30
+ return pd.DataFrame(data).T.fillna(0).sum(axis=1)
31
+
32
+
33
+ class TimeSeriesBase(PandasOperand, abc.ABC):
14
34
  is_vector: bool = True
15
35
  date_unit: Optional[str] = None
16
36
  offset_size: int = 0
@@ -35,7 +55,7 @@ class TimeSeriesBase(PandasOperator, abc.ABC):
35
55
  ts.set_index(date.name, inplace=True)
36
56
  ts = ts[ts.index.notna()].sort_index()
37
57
  ts = (
38
- ts.groupby([c.name for c in data[1:-1]], group_keys=True)
58
+ ts.groupby([c.name for c in data[1:-1]])
39
59
  .apply(self._shift)[data[-1].name]
40
60
  .to_frame()
41
61
  .reset_index()
@@ -64,10 +84,10 @@ class TimeSeriesBase(PandasOperator, abc.ABC):
64
84
  pass
65
85
 
66
86
 
67
- _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean(), "last": lambda x: x[-1]}
87
+ _roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
68
88
 
69
89
 
70
- class Roll(TimeSeriesBase, ParametrizedOperator):
90
+ class Roll(TimeSeriesBase, ParametrizedOperand):
71
91
  aggregation: str
72
92
  window_size: int = 1
73
93
  window_unit: str = "D"
@@ -142,7 +162,7 @@ class Roll(TimeSeriesBase, ParametrizedOperator):
142
162
  )
143
163
 
144
164
 
145
- class Lag(TimeSeriesBase, ParametrizedOperator):
165
+ class Lag(TimeSeriesBase, ParametrizedOperand):
146
166
  lag_size: int
147
167
  lag_unit: str = "D"
148
168
 
@@ -31,7 +31,7 @@ from sklearn.exceptions import NotFittedError
31
31
  from sklearn.model_selection import BaseCrossValidator
32
32
 
33
33
  from upgini.autofe.feature import Feature
34
- from upgini.autofe.timeseries import TimeSeriesBase
34
+ from upgini.autofe.vector import TimeSeriesBase
35
35
  from upgini.data_source.data_source_publisher import CommercialSchema
36
36
  from upgini.dataset import Dataset
37
37
  from upgini.errors import HttpError, ValidationError
@@ -3,7 +3,6 @@ from collections import namedtuple
3
3
 
4
4
  import numpy as np
5
5
  import numpy.ma as ma
6
- import scipy
7
6
  from joblib import Parallel, delayed
8
7
  from numpy import ndarray
9
8
  from psutil import cpu_count
@@ -116,6 +115,22 @@ def spearmanr(
116
115
  if nan_policy == "omit":
117
116
  x = mask_fn(x)
118
117
 
118
+ # - dof: degrees of freedom
119
+ # - t_stat: t-statistic
120
+ # - alternative: 'two-sided', 'greater', 'less'
121
+ def compute_t_pvalue(t_stat, dof, alternative='two-sided'):
122
+ from scipy.stats import t
123
+
124
+ if alternative == "two-sided":
125
+ prob = 2 * t.sf(abs(t_stat), dof)
126
+ elif alternative == "greater":
127
+ prob = t.sf(t_stat, dof)
128
+ elif alternative == "less":
129
+ prob = t.cdf(t_stat, dof)
130
+ else:
131
+ raise ValueError(f"Unknown alternative: {alternative}")
132
+ return t_stat, prob
133
+
119
134
  def _spearmanr_2cols(x):
120
135
  # Mask the same observations for all variables, and then drop those
121
136
  # observations (can't leave them masked, rankdata is weird).
@@ -142,7 +157,7 @@ def spearmanr(
142
157
  # errors before taking the square root
143
158
  t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
144
159
 
145
- t, prob = scipy.stats._mstats_basic._ttest_finish(dof, t, alternative)
160
+ t, prob = compute_t_pvalue(dof, t, alternative)
146
161
 
147
162
  # For backwards compatibility, return scalars when comparing 2 columns
148
163
  if rs.shape == (2, 2):
@@ -1 +0,0 @@
1
- __version__ = "1.2.62a3818.dev3"
@@ -1,24 +0,0 @@
1
- from typing import List, Optional
2
-
3
- import pandas as pd
4
-
5
- from upgini.autofe.operator import PandasOperator, VectorizableMixin
6
-
7
-
8
- class Mean(PandasOperator, VectorizableMixin):
9
- name: str = "mean"
10
- output_type: Optional[str] = "float"
11
- is_vector: bool = True
12
- group_index: int = 0
13
-
14
- def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
15
- return pd.DataFrame(data).T.fillna(0).mean(axis=1)
16
-
17
-
18
- class Sum(PandasOperator, VectorizableMixin):
19
- name: str = "sum"
20
- is_vector: bool = True
21
- group_index: int = 0
22
-
23
- def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
24
- return pd.DataFrame(data).T.fillna(0).sum(axis=1)
File without changes
File without changes
File without changes