upgini 1.2.63__tar.gz → 1.2.65a3818.dev5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/PKG-INFO +1 -1
  2. upgini-1.2.65a3818.dev5/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/all_operands.py +2 -2
  4. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/binary.py +11 -11
  5. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/date.py +6 -6
  6. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/feature.py +6 -6
  7. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/groupby.py +6 -6
  8. upgini-1.2.63/src/upgini/autofe/operand.py → upgini-1.2.65a3818.dev5/src/upgini/autofe/operator.py +13 -11
  9. upgini-1.2.65a3818.dev5/src/upgini/autofe/timeseries/__init__.py +23 -0
  10. upgini-1.2.65a3818.dev5/src/upgini/autofe/timeseries/base.py +105 -0
  11. upgini-1.2.65a3818.dev5/src/upgini/autofe/timeseries/cross.py +130 -0
  12. upgini-1.2.65a3818.dev5/src/upgini/autofe/timeseries/delta.py +119 -0
  13. upgini-1.2.65a3818.dev5/src/upgini/autofe/timeseries/lag.py +68 -0
  14. upgini-1.2.65a3818.dev5/src/upgini/autofe/timeseries/roll.py +92 -0
  15. upgini-1.2.65a3818.dev5/src/upgini/autofe/timeseries/trend.py +61 -0
  16. upgini-1.2.65a3818.dev5/src/upgini/autofe/timeseries/volatility.py +259 -0
  17. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/unary.py +11 -11
  18. upgini-1.2.65a3818.dev5/src/upgini/autofe/vector.py +24 -0
  19. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/features_enricher.py +2 -2
  20. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/sort.py +4 -2
  21. upgini-1.2.63/src/upgini/__about__.py +0 -1
  22. upgini-1.2.63/src/upgini/autofe/vector.py +0 -220
  23. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/.gitignore +0 -0
  24. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/LICENSE +0 -0
  25. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/README.md +0 -0
  26. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/pyproject.toml +0 -0
  27. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/__init__.py +0 -0
  28. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/ads.py +0 -0
  29. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/ads_management/__init__.py +0 -0
  30. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/ads_management/ads_manager.py +0 -0
  31. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/autofe/__init__.py +0 -0
  32. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/data_source/__init__.py +0 -0
  33. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/data_source/data_source_publisher.py +0 -0
  34. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/dataset.py +0 -0
  35. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/errors.py +0 -0
  36. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/http.py +0 -0
  37. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/lazy_import.py +0 -0
  38. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/mdc/__init__.py +0 -0
  39. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/mdc/context.py +0 -0
  40. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/metadata.py +0 -0
  41. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/metrics.py +0 -0
  42. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/normalizer/__init__.py +0 -0
  43. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/normalizer/normalize_utils.py +0 -0
  44. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/resource_bundle/__init__.py +0 -0
  45. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/resource_bundle/exceptions.py +0 -0
  46. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/resource_bundle/strings.properties +0 -0
  47. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  48. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/sampler/__init__.py +0 -0
  49. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/sampler/base.py +0 -0
  50. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/sampler/random_under_sampler.py +0 -0
  51. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/sampler/utils.py +0 -0
  52. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/search_task.py +0 -0
  53. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/spinner.py +0 -0
  54. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  55. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/__init__.py +0 -0
  56. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/base_search_key_detector.py +0 -0
  57. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/blocked_time_series.py +0 -0
  58. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/country_utils.py +0 -0
  59. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/custom_loss_utils.py +0 -0
  60. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/cv_utils.py +0 -0
  61. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/datetime_utils.py +0 -0
  62. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/deduplicate_utils.py +0 -0
  63. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/display_utils.py +0 -0
  64. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/email_utils.py +0 -0
  65. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/fallback_progress_bar.py +0 -0
  66. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/feature_info.py +0 -0
  67. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/features_validator.py +0 -0
  68. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/format.py +0 -0
  69. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/ip_utils.py +0 -0
  70. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/mstats.py +0 -0
  71. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/phone_utils.py +0 -0
  72. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/postal_code_utils.py +0 -0
  73. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/progress_bar.py +0 -0
  74. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/sklearn_ext.py +0 -0
  75. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/target_utils.py +0 -0
  76. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/track_info.py +0 -0
  77. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/ts_utils.py +0 -0
  78. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/utils/warning_counter.py +0 -0
  79. {upgini-1.2.63 → upgini-1.2.65a3818.dev5}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.63
3
+ Version: 1.2.65a3818.dev5
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.65a3818.dev5"
@@ -1,4 +1,4 @@
1
- from upgini.autofe.operand import OperandRegistry
1
+ from upgini.autofe.operator import OperatorRegistry
2
2
  from upgini.autofe.unary import * # noqa
3
3
  from upgini.autofe.binary import * # noqa
4
4
  from upgini.autofe.groupby import * # noqa
@@ -7,4 +7,4 @@ from upgini.autofe.vector import * # noqa
7
7
 
8
8
 
9
9
  def find_op(name):
10
- return OperandRegistry.get_operand(name)
10
+ return OperatorRegistry.get_operator(name)
@@ -5,10 +5,10 @@ import numpy as np
5
5
  import pandas as pd
6
6
  from jarowinkler import jarowinkler_similarity
7
7
 
8
- from upgini.autofe.operand import PandasOperand, VectorizableMixin
8
+ from upgini.autofe.operator import PandasOperator, VectorizableMixin
9
9
 
10
10
 
11
- class Min(PandasOperand):
11
+ class Min(PandasOperator):
12
12
  name: str = "min"
13
13
  is_binary: bool = True
14
14
  is_symmetrical: bool = True
@@ -18,7 +18,7 @@ class Min(PandasOperand):
18
18
  return np.minimum(left, right)
19
19
 
20
20
 
21
- class Max(PandasOperand):
21
+ class Max(PandasOperator):
22
22
  name: str = "max"
23
23
  is_binary: bool = True
24
24
  is_symmetrical: bool = True
@@ -28,7 +28,7 @@ class Max(PandasOperand):
28
28
  return np.maximum(left, right)
29
29
 
30
30
 
31
- class Add(PandasOperand, VectorizableMixin):
31
+ class Add(PandasOperator, VectorizableMixin):
32
32
  name: str = "+"
33
33
  alias: str = "add"
34
34
  is_binary: bool = True
@@ -47,7 +47,7 @@ class Add(PandasOperand, VectorizableMixin):
47
47
  return d1.add(d2, axis=0)
48
48
 
49
49
 
50
- class Subtract(PandasOperand, VectorizableMixin):
50
+ class Subtract(PandasOperator, VectorizableMixin):
51
51
  name: str = "-"
52
52
  alias: str = "sub"
53
53
  is_binary: bool = True
@@ -66,7 +66,7 @@ class Subtract(PandasOperand, VectorizableMixin):
66
66
  return d1.sub(d2, axis=0)
67
67
 
68
68
 
69
- class Multiply(PandasOperand, VectorizableMixin):
69
+ class Multiply(PandasOperator, VectorizableMixin):
70
70
  name: str = "*"
71
71
  alias: str = "mul"
72
72
  is_binary: bool = True
@@ -85,7 +85,7 @@ class Multiply(PandasOperand, VectorizableMixin):
85
85
  return d1.mul(d2, axis=0)
86
86
 
87
87
 
88
- class Divide(PandasOperand, VectorizableMixin):
88
+ class Divide(PandasOperator, VectorizableMixin):
89
89
  name: str = "/"
90
90
  alias: str = "div"
91
91
  is_binary: bool = True
@@ -104,7 +104,7 @@ class Divide(PandasOperand, VectorizableMixin):
104
104
  return d1.div(d2.replace(0, np.nan), axis=0)
105
105
 
106
106
 
107
- class Combine(PandasOperand):
107
+ class Combine(PandasOperator):
108
108
  name: str = "Combine"
109
109
  is_binary: bool = True
110
110
  has_symmetry_importance: bool = True
@@ -116,7 +116,7 @@ class Combine(PandasOperand):
116
116
  return pd.Series(temp, index=left.index)
117
117
 
118
118
 
119
- class CombineThenFreq(PandasOperand):
119
+ class CombineThenFreq(PandasOperator):
120
120
  name: str = "CombineThenFreq"
121
121
  is_binary: bool = True
122
122
  is_symmetrical: bool = True
@@ -132,7 +132,7 @@ class CombineThenFreq(PandasOperand):
132
132
  self._loc(temp, value_counts)
133
133
 
134
134
 
135
- class Distance(PandasOperand):
135
+ class Distance(PandasOperator):
136
136
  name: str = "dist"
137
137
  is_binary: bool = True
138
138
  output_type: Optional[str] = "float"
@@ -170,7 +170,7 @@ class Sim(Distance):
170
170
  return 1 - super().calculate_binary(left, right)
171
171
 
172
172
 
173
- class StringSim(PandasOperand, abc.ABC):
173
+ class StringSim(PandasOperator, abc.ABC):
174
174
  def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
175
175
  sims = []
176
176
  for i in left.index:
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas.core.arrays.timedeltas import TimedeltaArray
8
8
  from pydantic import BaseModel, __version__ as pydantic_version
9
9
 
10
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand
10
+ from upgini.autofe.operator import PandasOperator, ParametrizedOperator
11
11
 
12
12
 
13
13
  def get_pydantic_version():
@@ -43,7 +43,7 @@ class DateDiffMixin(BaseModel):
43
43
  raise Exception(f"Unsupported difference unit: {self.diff_unit}")
44
44
 
45
45
 
46
- class DateDiff(PandasOperand, DateDiffMixin):
46
+ class DateDiff(PandasOperator, DateDiffMixin):
47
47
  name: str = "date_diff"
48
48
  alias: Optional[str] = "date_diff_type1"
49
49
  is_binary: bool = True
@@ -78,7 +78,7 @@ class DateDiff(PandasOperand, DateDiffMixin):
78
78
  return x
79
79
 
80
80
 
81
- class DateDiffType2(PandasOperand, DateDiffMixin):
81
+ class DateDiffType2(PandasOperator, DateDiffMixin):
82
82
  name: str = "date_diff_type2"
83
83
  is_binary: bool = True
84
84
  has_symmetry_importance: bool = True
@@ -112,7 +112,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
112
112
  _count_aggregations = ["nunique", "count"]
113
113
 
114
114
 
115
- class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
115
+ class DateListDiff(PandasOperator, DateDiffMixin, ParametrizedOperator):
116
116
  is_binary: bool = True
117
117
  has_symmetry_importance: bool = True
118
118
 
@@ -183,7 +183,7 @@ class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
183
183
  return method(x) if len(x) > 0 else default
184
184
 
185
185
 
186
- class DateListDiffBounded(DateListDiff, ParametrizedOperand):
186
+ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
187
187
  lower_bound: Optional[int] = None
188
188
  upper_bound: Optional[int] = None
189
189
 
@@ -217,7 +217,7 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperand):
217
217
  return super()._agg(x)
218
218
 
219
219
 
220
- class DatePercentileBase(PandasOperand, abc.ABC):
220
+ class DatePercentileBase(PandasOperator, abc.ABC):
221
221
  is_binary: bool = True
222
222
  output_type: Optional[str] = "float"
223
223
 
@@ -7,7 +7,7 @@ import pandas as pd
7
7
  from pandas._typing import DtypeObj
8
8
 
9
9
  from upgini.autofe.all_operands import find_op
10
- from upgini.autofe.operand import Operand, PandasOperand
10
+ from upgini.autofe.operator import Operator, PandasOperator
11
11
 
12
12
 
13
13
  class Column:
@@ -65,7 +65,7 @@ class Column:
65
65
  class Feature:
66
66
  def __init__(
67
67
  self,
68
- op: Operand,
68
+ op: Operator,
69
69
  children: List[Union[Column, "Feature"]],
70
70
  data: Optional[pd.DataFrame] = None,
71
71
  display_index: Optional[str] = None,
@@ -188,7 +188,7 @@ class Feature:
188
188
  return self.children[0].infer_type(data)
189
189
 
190
190
  def calculate(self, data: pd.DataFrame, is_root=False) -> Union[pd.Series, pd.DataFrame]:
191
- if isinstance(self.op, PandasOperand):
191
+ if isinstance(self.op, PandasOperator):
192
192
  if self.op.is_vector:
193
193
  ds = [child.calculate(data) for child in self.children]
194
194
  new_data = self.op.calculate(data=ds)
@@ -324,7 +324,7 @@ class Feature:
324
324
 
325
325
  class FeatureGroup:
326
326
  def __init__(
327
- self, op: Operand, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
327
+ self, op: Operator, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
328
328
  ):
329
329
  self.op = op
330
330
  self.main_column_node = main_column
@@ -345,7 +345,7 @@ class FeatureGroup:
345
345
  return names
346
346
 
347
347
  def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
348
- if isinstance(self.op, PandasOperand):
348
+ if isinstance(self.op, PandasOperator):
349
349
  main_column = None if self.main_column_node is None else self.main_column_node.get_display_name()
350
350
  lower_order_children = []
351
351
  if self.main_column_node is not None:
@@ -378,7 +378,7 @@ class FeatureGroup:
378
378
  def make_groups(candidates: List[Feature]) -> List[Union[Feature, "FeatureGroup"]]:
379
379
  grouped_features = []
380
380
 
381
- def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
381
+ def groupby_func(f: Feature) -> Tuple[Operator, Union[Column, Feature]]:
382
382
  return (f.op, f.children[0 if not f.op.is_vectorizable else f.op.group_index])
383
383
 
384
384
  for op_child, features in itertools.groupby(candidates, groupby_func):
@@ -2,13 +2,13 @@ from typing import Optional
2
2
 
3
3
  import pandas as pd
4
4
 
5
- from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
5
+ from upgini.autofe.operator import PandasOperator, ParametrizedOperator, VectorizableMixin
6
6
 
7
7
 
8
8
  class GroupByThenAgg(
9
- PandasOperand,
9
+ PandasOperator,
10
10
  VectorizableMixin,
11
- ParametrizedOperand,
11
+ ParametrizedOperator,
12
12
  ):
13
13
  agg: Optional[str]
14
14
  is_vectorizable: bool = True
@@ -39,7 +39,7 @@ class GroupByThenAgg(
39
39
  return temp.merge(d2, how="right", on=[group_column])[value_columns]
40
40
 
41
41
 
42
- class GroupByThenRank(PandasOperand, VectorizableMixin):
42
+ class GroupByThenRank(PandasOperator, VectorizableMixin):
43
43
  name: str = "GroupByThenRank"
44
44
  is_vectorizable: bool = True
45
45
  is_grouping: bool = True
@@ -58,7 +58,7 @@ class GroupByThenRank(PandasOperand, VectorizableMixin):
58
58
  return temp.merge(d2.reset_index(), how="right", on=["index"])[value_columns]
59
59
 
60
60
 
61
- class GroupByThenNUnique(PandasOperand, VectorizableMixin):
61
+ class GroupByThenNUnique(PandasOperator, VectorizableMixin):
62
62
  name: str = "GroupByThenNUnique"
63
63
  is_vectorizable: bool = True
64
64
  is_grouping: bool = True
@@ -78,7 +78,7 @@ class GroupByThenNUnique(PandasOperand, VectorizableMixin):
78
78
  return nunique.merge(d2, how="right", on=[group_column])[value_columns]
79
79
 
80
80
 
81
- class GroupByThenFreq(PandasOperand):
81
+ class GroupByThenFreq(PandasOperator):
82
82
  name: str = "GroupByThenFreq"
83
83
  is_grouping: bool = True
84
84
  output_type: Optional[str] = "float"
@@ -6,7 +6,7 @@ import pandas as pd
6
6
  from pydantic import BaseModel
7
7
 
8
8
 
9
- class OperandRegistry(type(BaseModel)):
9
+ class OperatorRegistry(type(BaseModel)):
10
10
  _registry = {}
11
11
  _parametrized_registry = []
12
12
 
@@ -20,23 +20,25 @@ class OperandRegistry(type(BaseModel)):
20
20
  base_names.update(b.__name__ for b in base.__bases__)
21
21
  base_classes.extend(base.__bases__)
22
22
 
23
- if "Operand" in base_names:
23
+ if "Operator" in base_names:
24
24
  # Track parametrized operands separately
25
- if "ParametrizedOperand" in base_names:
25
+ if "ParametrizedOperator" in base_names:
26
26
  cls._parametrized_registry.append(new_class)
27
27
  else:
28
28
  try:
29
29
  instance = new_class()
30
30
  cls._registry[instance.name] = new_class
31
+ if instance.alias:
32
+ cls._registry[instance.alias] = new_class
31
33
  except Exception:
32
34
  pass
33
35
  return new_class
34
36
 
35
37
  @classmethod
36
- def get_operand(cls, name: str) -> Optional["Operand"]:
38
+ def get_operator(cls, name: str) -> Optional["Operator"]:
37
39
  # First try to resolve as a parametrized operand formula
38
- for operand_cls in cls._parametrized_registry:
39
- resolved = operand_cls.from_formula(name)
40
+ for operator_cls in cls._parametrized_registry:
41
+ resolved = operator_cls.from_formula(name)
40
42
  if resolved is not None:
41
43
  return resolved
42
44
  # Fall back to direct registry lookup
@@ -46,7 +48,7 @@ class OperandRegistry(type(BaseModel)):
46
48
  return None
47
49
 
48
50
 
49
- class Operand(BaseModel, metaclass=OperandRegistry):
51
+ class Operator(BaseModel, metaclass=OperatorRegistry):
50
52
  name: Optional[str] = None
51
53
  alias: Optional[str] = None
52
54
  is_unary: bool = False
@@ -75,7 +77,7 @@ class Operand(BaseModel, metaclass=OperandRegistry):
75
77
  return self.name
76
78
 
77
79
 
78
- class ParametrizedOperand(Operand, abc.ABC):
80
+ class ParametrizedOperator(Operator, abc.ABC):
79
81
 
80
82
  @abc.abstractmethod
81
83
  def to_formula(self) -> str:
@@ -83,14 +85,14 @@ class ParametrizedOperand(Operand, abc.ABC):
83
85
 
84
86
  @classmethod
85
87
  @abc.abstractmethod
86
- def from_formula(cls, formula: str) -> Optional["Operand"]:
88
+ def from_formula(cls, formula: str) -> Optional["Operator"]:
87
89
  pass
88
90
 
89
91
 
90
92
  MAIN_COLUMN = "main_column"
91
93
 
92
94
 
93
- class PandasOperand(Operand, abc.ABC):
95
+ class PandasOperator(Operator, abc.ABC):
94
96
  def calculate(self, **kwargs) -> pd.Series:
95
97
  if self.is_unary:
96
98
  return self.calculate_unary(kwargs["data"])
@@ -131,7 +133,7 @@ class PandasOperand(Operand, abc.ABC):
131
133
  return value
132
134
 
133
135
 
134
- class VectorizableMixin(Operand):
136
+ class VectorizableMixin(Operator):
135
137
  group_index: int = 1
136
138
 
137
139
  def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:
@@ -0,0 +1,23 @@
1
+ """Time series feature engineering operators."""
2
+
3
+ from upgini.autofe.timeseries.base import TimeSeriesBase
4
+ from upgini.autofe.timeseries.roll import Roll
5
+ from upgini.autofe.timeseries.lag import Lag
6
+ from upgini.autofe.timeseries.delta import Delta, Delta2
7
+ from upgini.autofe.timeseries.trend import TrendCoefficient
8
+ from upgini.autofe.timeseries.volatility import EWMAVolatility, RollingVolatility, RollingVolatility2, VolatilityRatio
9
+ from upgini.autofe.timeseries.cross import CrossSeriesInteraction
10
+
11
+ __all__ = [
12
+ "TimeSeriesBase",
13
+ "Roll",
14
+ "Lag",
15
+ "Delta",
16
+ "Delta2",
17
+ "TrendCoefficient",
18
+ "EWMAVolatility",
19
+ "RollingVolatility",
20
+ "RollingVolatility2",
21
+ "VolatilityRatio",
22
+ "CrossSeriesInteraction",
23
+ ]
@@ -0,0 +1,105 @@
1
+ import abc
2
+ from typing import Dict, List, Optional
3
+
4
+ import pandas as pd
5
+ from upgini.autofe.operator import PandasOperator
6
+
7
+ # Used in derived classes
8
+ try:
9
+ from pydantic import field_validator as validator # V2
10
+ except ImportError:
11
+ from pydantic import validator # V1
12
+
13
+
14
+ class TimeSeriesBase(PandasOperator, abc.ABC):
15
+ is_vector: bool = True
16
+ date_unit: Optional[str] = None
17
+ offset_size: int = 0
18
+ offset_unit: str = "D"
19
+
20
+ def get_params(self) -> Dict[str, Optional[str]]:
21
+ res = super().get_params()
22
+ res.update(
23
+ {
24
+ "date_unit": self.date_unit,
25
+ "offset_size": self.offset_size,
26
+ "offset_unit": self.offset_unit,
27
+ }
28
+ )
29
+ return res
30
+
31
+ def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
32
+ # assuming first is date, last is value, rest is group columns
33
+ date = pd.to_datetime(data[0], unit=self.date_unit, errors="coerce")
34
+ ts = pd.concat([date] + data[1:], axis=1)
35
+ ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
36
+ ts.set_index(date.name, inplace=True)
37
+ ts = ts[ts.index.notna()].sort_index()
38
+ ts = (
39
+ ts.groupby([c.name for c in data[1:-1]], group_keys=True)
40
+ .apply(self._shift)[data[-1].name]
41
+ .to_frame()
42
+ .reset_index()
43
+ .set_index(date.name)
44
+ .groupby([c.name for c in data[1:-1]], group_keys=True)
45
+ if len(data) > 2
46
+ else self._shift(ts)
47
+ )
48
+ ts = self._aggregate(ts)
49
+ ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
50
+ ts.index = date.index
51
+
52
+ return ts.iloc[:, -1]
53
+
54
+ def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
55
+ if self.offset_size > 0:
56
+ return ts.iloc[:, :-1].merge(
57
+ ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
58
+ left_index=True,
59
+ right_index=True,
60
+ )
61
+ return ts
62
+
63
+ @abc.abstractmethod
64
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
65
+ pass
66
+
67
+ def _add_offset_to_formula(self, base_formula: str) -> str:
68
+ if self.offset_size > 0:
69
+ return f"{base_formula}_offset_{self.offset_size}{self.offset_unit}"
70
+ return base_formula
71
+
72
+ @classmethod
73
+ def _parse_offset_from_formula(cls, formula: str, base_regex: str) -> tuple[Optional[dict], Optional[str]]:
74
+ """
75
+ Parse the offset component from a formula.
76
+
77
+ Args:
78
+ formula: The formula to parse
79
+ base_regex: The regex pattern for the base formula (without offset)
80
+
81
+ Returns:
82
+ A tuple with:
83
+ - Dictionary with offset parameters if found, None otherwise
84
+ - Remaining part of the formula after removing offset component (for further parsing)
85
+ """
86
+ import re
87
+
88
+ offset_regex = f"{base_regex}_offset_(\\d+)([a-zA-Z])"
89
+ match = re.match(offset_regex, formula)
90
+
91
+ if match:
92
+ # Get groups from the offset part
93
+ offset_size = int(match.group(match.lastindex - 1))
94
+ offset_unit = match.group(match.lastindex)
95
+
96
+ # Return the parameters and the base formula for further parsing if needed
97
+ # Extract the base formula by using the match object
98
+ base_formula = formula[: match.start(match.lastindex - 1) - len("_offset_")]
99
+ return {"offset_size": offset_size, "offset_unit": offset_unit}, base_formula
100
+
101
+ # Check if it matches the base regex (no offset)
102
+ if re.match(f"^{base_regex}$", formula) or re.match(f"^{base_regex}_", formula):
103
+ return None, formula
104
+
105
+ return None, None
@@ -0,0 +1,130 @@
1
+ from typing import Dict, List, Optional
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+
6
+ try:
7
+ from pydantic import field_validator as validator # V2
8
+ except ImportError:
9
+ from pydantic import validator # V1
10
+
11
+ from upgini.autofe.all_operands import find_op
12
+ from upgini.autofe.operator import PandasOperator, ParametrizedOperator
13
+ from upgini.autofe.timeseries.base import TimeSeriesBase
14
+
15
+
16
+ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
17
+ base_name: str = "cross"
18
+ interaction_op: PandasOperator
19
+ descriptor_indices: List[int] = []
20
+ left_descriptor: List[str] = []
21
+ right_descriptor: List[str] = []
22
+
23
+ @validator("descriptor_indices")
24
+ @classmethod
25
+ def validate_descriptor_indices(cls, v):
26
+ if not v:
27
+ raise ValueError("descriptor_indices cannot be empty for CrossSeriesInteraction")
28
+ return v
29
+
30
+ def __init__(self, **data):
31
+ super().__init__(**data)
32
+ indices = self.descriptor_indices
33
+ left = self.left_descriptor
34
+ right = self.right_descriptor
35
+
36
+ if len(left) != len(indices):
37
+ raise ValueError(
38
+ f"left_descriptor length ({len(left)}) " f"must match descriptor_indices length ({len(indices)})"
39
+ )
40
+
41
+ if len(right) != len(indices):
42
+ raise ValueError(
43
+ f"right_descriptor length ({len(right)}) " f"must match descriptor_indices length ({len(indices)})"
44
+ )
45
+
46
+ def to_formula(self) -> str:
47
+ base_formula = f"{self.base_name}_{self._get_interaction_op_name()}"
48
+ return self._add_offset_to_formula(base_formula)
49
+
50
+ @classmethod
51
+ def from_formula(cls, formula: str) -> Optional["CrossSeriesInteraction"]:
52
+ base_regex = r"cross_(.+)"
53
+
54
+ offset_params, remaining_formula = cls._parse_offset_from_formula(formula, base_regex)
55
+
56
+ if remaining_formula is None:
57
+ return None
58
+
59
+ import re
60
+
61
+ match = re.match(f"^{base_regex}$", remaining_formula)
62
+
63
+ if not match:
64
+ return None
65
+
66
+ # Extract the operator formula
67
+ op_formula = match.group(1)
68
+
69
+ op = find_op(op_formula)
70
+ if op is None or not op.is_binary:
71
+ return None
72
+
73
+ # Include default values to pass validation
74
+ params = {
75
+ "interaction_op": op,
76
+ "descriptor_indices": [0], # Default index
77
+ "left_descriptor": ["default"], # Default left descriptor
78
+ "right_descriptor": ["default"], # Default right descriptor
79
+ }
80
+
81
+ if offset_params:
82
+ params.update(offset_params)
83
+
84
+ return cls(**params)
85
+
86
+ def get_params(self) -> Dict[str, str | None]:
87
+ res = super().get_params()
88
+ res.update(
89
+ {
90
+ "interaction_op": self._get_interaction_op_name(),
91
+ "descriptor_indices": self.descriptor_indices,
92
+ "left_descriptor": self.left_descriptor,
93
+ "right_descriptor": self.right_descriptor,
94
+ }
95
+ )
96
+ return res
97
+
98
+ def _get_interaction_op_name(self) -> str:
99
+ return self.interaction_op.alias or self.interaction_op.to_formula()
100
+
101
+ def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
102
+ left_mask = self._get_mask(data, self.left_descriptor)
103
+ left = self._extract_series(data, left_mask)
104
+
105
+ right_mask = self._get_mask(data, self.right_descriptor)
106
+ right = self._extract_series(data, right_mask)
107
+
108
+ interaction: pd.Series = self.interaction_op.calculate_binary(left, right)
109
+ interaction = interaction.reindex(self._get_index(data))
110
+ res = pd.Series(np.nan, index=data[-1].index, name=data[-1].name)
111
+ res.loc[left_mask] = interaction[left_mask].values
112
+ res.loc[right_mask] = interaction[right_mask].values
113
+ return res
114
+
115
+ def _get_mask(self, data: List[pd.Series], descriptor: List[str]) -> pd.Series:
116
+ mask = np.logical_and.reduce([data[i] == v for i, v in zip(self.descriptor_indices, descriptor)])
117
+ return mask
118
+
119
+ def _extract_series(self, data: List[pd.Series], mask: pd.Series) -> pd.Series:
120
+ masked_data = [d[mask] for d in data]
121
+ shifted = super().calculate_vector(masked_data)
122
+ shifted.index = self._get_index(masked_data)
123
+ return shifted
124
+
125
+ def _get_index(self, data: List[pd.Series]) -> pd.Series:
126
+ index = [d for i, d in enumerate(data[:-1]) if i not in self.descriptor_indices]
127
+ return index if len(index) > 1 else index[0]
128
+
129
+ def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
130
+ return ts.apply(lambda x: x).iloc[:, [-1]]