upgini 1.2.62a3818.dev3__tar.gz → 1.2.63__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/PKG-INFO +2 -1
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/pyproject.toml +1 -0
- upgini-1.2.63/src/upgini/__about__.py +1 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/all_operands.py +2 -2
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/binary.py +11 -11
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/date.py +6 -6
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/feature.py +6 -6
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/groupby.py +6 -6
- upgini-1.2.62a3818.dev3/src/upgini/autofe/operator.py → upgini-1.2.63/src/upgini/autofe/operand.py +9 -9
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/unary.py +11 -11
- upgini-1.2.62a3818.dev3/src/upgini/autofe/timeseries.py → upgini-1.2.63/src/upgini/autofe/vector.py +26 -6
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/features_enricher.py +1 -1
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/mstats.py +17 -2
- upgini-1.2.62a3818.dev3/src/upgini/__about__.py +0 -1
- upgini-1.2.62a3818.dev3/src/upgini/autofe/vector.py +0 -24
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/.gitignore +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/LICENSE +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/README.md +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/__init__.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/ads.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/dataset.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/errors.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/http.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/metadata.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/metrics.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/search_task.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/spinner.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/version_validator.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.63
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -38,6 +38,7 @@ Requires-Dist: python-dateutil>=2.8.0
|
|
|
38
38
|
Requires-Dist: python-json-logger>=3.3.0
|
|
39
39
|
Requires-Dist: requests>=2.8.0
|
|
40
40
|
Requires-Dist: scikit-learn>=1.3.0
|
|
41
|
+
Requires-Dist: scipy>=1.10.0
|
|
41
42
|
Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
|
|
42
43
|
Description-Content-Type: text/markdown
|
|
43
44
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.63"
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from upgini.autofe.
|
|
1
|
+
from upgini.autofe.operand import OperandRegistry
|
|
2
2
|
from upgini.autofe.unary import * # noqa
|
|
3
3
|
from upgini.autofe.binary import * # noqa
|
|
4
4
|
from upgini.autofe.groupby import * # noqa
|
|
@@ -7,4 +7,4 @@ from upgini.autofe.vector import * # noqa
|
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
def find_op(name):
|
|
10
|
-
return
|
|
10
|
+
return OperandRegistry.get_operand(name)
|
|
@@ -5,10 +5,10 @@ import numpy as np
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from jarowinkler import jarowinkler_similarity
|
|
7
7
|
|
|
8
|
-
from upgini.autofe.
|
|
8
|
+
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
class Min(
|
|
11
|
+
class Min(PandasOperand):
|
|
12
12
|
name: str = "min"
|
|
13
13
|
is_binary: bool = True
|
|
14
14
|
is_symmetrical: bool = True
|
|
@@ -18,7 +18,7 @@ class Min(PandasOperator):
|
|
|
18
18
|
return np.minimum(left, right)
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
class Max(
|
|
21
|
+
class Max(PandasOperand):
|
|
22
22
|
name: str = "max"
|
|
23
23
|
is_binary: bool = True
|
|
24
24
|
is_symmetrical: bool = True
|
|
@@ -28,7 +28,7 @@ class Max(PandasOperator):
|
|
|
28
28
|
return np.maximum(left, right)
|
|
29
29
|
|
|
30
30
|
|
|
31
|
-
class Add(
|
|
31
|
+
class Add(PandasOperand, VectorizableMixin):
|
|
32
32
|
name: str = "+"
|
|
33
33
|
alias: str = "add"
|
|
34
34
|
is_binary: bool = True
|
|
@@ -47,7 +47,7 @@ class Add(PandasOperator, VectorizableMixin):
|
|
|
47
47
|
return d1.add(d2, axis=0)
|
|
48
48
|
|
|
49
49
|
|
|
50
|
-
class Subtract(
|
|
50
|
+
class Subtract(PandasOperand, VectorizableMixin):
|
|
51
51
|
name: str = "-"
|
|
52
52
|
alias: str = "sub"
|
|
53
53
|
is_binary: bool = True
|
|
@@ -66,7 +66,7 @@ class Subtract(PandasOperator, VectorizableMixin):
|
|
|
66
66
|
return d1.sub(d2, axis=0)
|
|
67
67
|
|
|
68
68
|
|
|
69
|
-
class Multiply(
|
|
69
|
+
class Multiply(PandasOperand, VectorizableMixin):
|
|
70
70
|
name: str = "*"
|
|
71
71
|
alias: str = "mul"
|
|
72
72
|
is_binary: bool = True
|
|
@@ -85,7 +85,7 @@ class Multiply(PandasOperator, VectorizableMixin):
|
|
|
85
85
|
return d1.mul(d2, axis=0)
|
|
86
86
|
|
|
87
87
|
|
|
88
|
-
class Divide(
|
|
88
|
+
class Divide(PandasOperand, VectorizableMixin):
|
|
89
89
|
name: str = "/"
|
|
90
90
|
alias: str = "div"
|
|
91
91
|
is_binary: bool = True
|
|
@@ -104,7 +104,7 @@ class Divide(PandasOperator, VectorizableMixin):
|
|
|
104
104
|
return d1.div(d2.replace(0, np.nan), axis=0)
|
|
105
105
|
|
|
106
106
|
|
|
107
|
-
class Combine(
|
|
107
|
+
class Combine(PandasOperand):
|
|
108
108
|
name: str = "Combine"
|
|
109
109
|
is_binary: bool = True
|
|
110
110
|
has_symmetry_importance: bool = True
|
|
@@ -116,7 +116,7 @@ class Combine(PandasOperator):
|
|
|
116
116
|
return pd.Series(temp, index=left.index)
|
|
117
117
|
|
|
118
118
|
|
|
119
|
-
class CombineThenFreq(
|
|
119
|
+
class CombineThenFreq(PandasOperand):
|
|
120
120
|
name: str = "CombineThenFreq"
|
|
121
121
|
is_binary: bool = True
|
|
122
122
|
is_symmetrical: bool = True
|
|
@@ -132,7 +132,7 @@ class CombineThenFreq(PandasOperator):
|
|
|
132
132
|
self._loc(temp, value_counts)
|
|
133
133
|
|
|
134
134
|
|
|
135
|
-
class Distance(
|
|
135
|
+
class Distance(PandasOperand):
|
|
136
136
|
name: str = "dist"
|
|
137
137
|
is_binary: bool = True
|
|
138
138
|
output_type: Optional[str] = "float"
|
|
@@ -170,7 +170,7 @@ class Sim(Distance):
|
|
|
170
170
|
return 1 - super().calculate_binary(left, right)
|
|
171
171
|
|
|
172
172
|
|
|
173
|
-
class StringSim(
|
|
173
|
+
class StringSim(PandasOperand, abc.ABC):
|
|
174
174
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
175
175
|
sims = []
|
|
176
176
|
for i in left.index:
|
|
@@ -7,7 +7,7 @@ import pandas as pd
|
|
|
7
7
|
from pandas.core.arrays.timedeltas import TimedeltaArray
|
|
8
8
|
from pydantic import BaseModel, __version__ as pydantic_version
|
|
9
9
|
|
|
10
|
-
from upgini.autofe.
|
|
10
|
+
from upgini.autofe.operand import PandasOperand, ParametrizedOperand
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
def get_pydantic_version():
|
|
@@ -43,7 +43,7 @@ class DateDiffMixin(BaseModel):
|
|
|
43
43
|
raise Exception(f"Unsupported difference unit: {self.diff_unit}")
|
|
44
44
|
|
|
45
45
|
|
|
46
|
-
class DateDiff(
|
|
46
|
+
class DateDiff(PandasOperand, DateDiffMixin):
|
|
47
47
|
name: str = "date_diff"
|
|
48
48
|
alias: Optional[str] = "date_diff_type1"
|
|
49
49
|
is_binary: bool = True
|
|
@@ -78,7 +78,7 @@ class DateDiff(PandasOperator, DateDiffMixin):
|
|
|
78
78
|
return x
|
|
79
79
|
|
|
80
80
|
|
|
81
|
-
class DateDiffType2(
|
|
81
|
+
class DateDiffType2(PandasOperand, DateDiffMixin):
|
|
82
82
|
name: str = "date_diff_type2"
|
|
83
83
|
is_binary: bool = True
|
|
84
84
|
has_symmetry_importance: bool = True
|
|
@@ -112,7 +112,7 @@ _ext_aggregations = {"nunique": (lambda x: len(np.unique(x)), 0), "count": (len,
|
|
|
112
112
|
_count_aggregations = ["nunique", "count"]
|
|
113
113
|
|
|
114
114
|
|
|
115
|
-
class DateListDiff(
|
|
115
|
+
class DateListDiff(PandasOperand, DateDiffMixin, ParametrizedOperand):
|
|
116
116
|
is_binary: bool = True
|
|
117
117
|
has_symmetry_importance: bool = True
|
|
118
118
|
|
|
@@ -183,7 +183,7 @@ class DateListDiff(PandasOperator, DateDiffMixin, ParametrizedOperator):
|
|
|
183
183
|
return method(x) if len(x) > 0 else default
|
|
184
184
|
|
|
185
185
|
|
|
186
|
-
class DateListDiffBounded(DateListDiff,
|
|
186
|
+
class DateListDiffBounded(DateListDiff, ParametrizedOperand):
|
|
187
187
|
lower_bound: Optional[int] = None
|
|
188
188
|
upper_bound: Optional[int] = None
|
|
189
189
|
|
|
@@ -217,7 +217,7 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
|
|
|
217
217
|
return super()._agg(x)
|
|
218
218
|
|
|
219
219
|
|
|
220
|
-
class DatePercentileBase(
|
|
220
|
+
class DatePercentileBase(PandasOperand, abc.ABC):
|
|
221
221
|
is_binary: bool = True
|
|
222
222
|
output_type: Optional[str] = "float"
|
|
223
223
|
|
|
@@ -7,7 +7,7 @@ import pandas as pd
|
|
|
7
7
|
from pandas._typing import DtypeObj
|
|
8
8
|
|
|
9
9
|
from upgini.autofe.all_operands import find_op
|
|
10
|
-
from upgini.autofe.
|
|
10
|
+
from upgini.autofe.operand import Operand, PandasOperand
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
class Column:
|
|
@@ -65,7 +65,7 @@ class Column:
|
|
|
65
65
|
class Feature:
|
|
66
66
|
def __init__(
|
|
67
67
|
self,
|
|
68
|
-
op:
|
|
68
|
+
op: Operand,
|
|
69
69
|
children: List[Union[Column, "Feature"]],
|
|
70
70
|
data: Optional[pd.DataFrame] = None,
|
|
71
71
|
display_index: Optional[str] = None,
|
|
@@ -188,7 +188,7 @@ class Feature:
|
|
|
188
188
|
return self.children[0].infer_type(data)
|
|
189
189
|
|
|
190
190
|
def calculate(self, data: pd.DataFrame, is_root=False) -> Union[pd.Series, pd.DataFrame]:
|
|
191
|
-
if isinstance(self.op,
|
|
191
|
+
if isinstance(self.op, PandasOperand):
|
|
192
192
|
if self.op.is_vector:
|
|
193
193
|
ds = [child.calculate(data) for child in self.children]
|
|
194
194
|
new_data = self.op.calculate(data=ds)
|
|
@@ -324,7 +324,7 @@ class Feature:
|
|
|
324
324
|
|
|
325
325
|
class FeatureGroup:
|
|
326
326
|
def __init__(
|
|
327
|
-
self, op:
|
|
327
|
+
self, op: Operand, main_column: Optional[Union[Column, Feature]], children: List[Union[Column, Feature]]
|
|
328
328
|
):
|
|
329
329
|
self.op = op
|
|
330
330
|
self.main_column_node = main_column
|
|
@@ -345,7 +345,7 @@ class FeatureGroup:
|
|
|
345
345
|
return names
|
|
346
346
|
|
|
347
347
|
def calculate(self, data: pd.DataFrame, is_root=False) -> pd.DataFrame:
|
|
348
|
-
if isinstance(self.op,
|
|
348
|
+
if isinstance(self.op, PandasOperand):
|
|
349
349
|
main_column = None if self.main_column_node is None else self.main_column_node.get_display_name()
|
|
350
350
|
lower_order_children = []
|
|
351
351
|
if self.main_column_node is not None:
|
|
@@ -378,7 +378,7 @@ class FeatureGroup:
|
|
|
378
378
|
def make_groups(candidates: List[Feature]) -> List[Union[Feature, "FeatureGroup"]]:
|
|
379
379
|
grouped_features = []
|
|
380
380
|
|
|
381
|
-
def groupby_func(f: Feature) -> Tuple[
|
|
381
|
+
def groupby_func(f: Feature) -> Tuple[Operand, Union[Column, Feature]]:
|
|
382
382
|
return (f.op, f.children[0 if not f.op.is_vectorizable else f.op.group_index])
|
|
383
383
|
|
|
384
384
|
for op_child, features in itertools.groupby(candidates, groupby_func):
|
|
@@ -2,13 +2,13 @@ from typing import Optional
|
|
|
2
2
|
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
-
from upgini.autofe.
|
|
5
|
+
from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class GroupByThenAgg(
|
|
9
|
-
|
|
9
|
+
PandasOperand,
|
|
10
10
|
VectorizableMixin,
|
|
11
|
-
|
|
11
|
+
ParametrizedOperand,
|
|
12
12
|
):
|
|
13
13
|
agg: Optional[str]
|
|
14
14
|
is_vectorizable: bool = True
|
|
@@ -39,7 +39,7 @@ class GroupByThenAgg(
|
|
|
39
39
|
return temp.merge(d2, how="right", on=[group_column])[value_columns]
|
|
40
40
|
|
|
41
41
|
|
|
42
|
-
class GroupByThenRank(
|
|
42
|
+
class GroupByThenRank(PandasOperand, VectorizableMixin):
|
|
43
43
|
name: str = "GroupByThenRank"
|
|
44
44
|
is_vectorizable: bool = True
|
|
45
45
|
is_grouping: bool = True
|
|
@@ -58,7 +58,7 @@ class GroupByThenRank(PandasOperator, VectorizableMixin):
|
|
|
58
58
|
return temp.merge(d2.reset_index(), how="right", on=["index"])[value_columns]
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
class GroupByThenNUnique(
|
|
61
|
+
class GroupByThenNUnique(PandasOperand, VectorizableMixin):
|
|
62
62
|
name: str = "GroupByThenNUnique"
|
|
63
63
|
is_vectorizable: bool = True
|
|
64
64
|
is_grouping: bool = True
|
|
@@ -78,7 +78,7 @@ class GroupByThenNUnique(PandasOperator, VectorizableMixin):
|
|
|
78
78
|
return nunique.merge(d2, how="right", on=[group_column])[value_columns]
|
|
79
79
|
|
|
80
80
|
|
|
81
|
-
class GroupByThenFreq(
|
|
81
|
+
class GroupByThenFreq(PandasOperand):
|
|
82
82
|
name: str = "GroupByThenFreq"
|
|
83
83
|
is_grouping: bool = True
|
|
84
84
|
output_type: Optional[str] = "float"
|
upgini-1.2.62a3818.dev3/src/upgini/autofe/operator.py → upgini-1.2.63/src/upgini/autofe/operand.py
RENAMED
|
@@ -6,7 +6,7 @@ import pandas as pd
|
|
|
6
6
|
from pydantic import BaseModel
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
class
|
|
9
|
+
class OperandRegistry(type(BaseModel)):
|
|
10
10
|
_registry = {}
|
|
11
11
|
_parametrized_registry = []
|
|
12
12
|
|
|
@@ -20,9 +20,9 @@ class OperatorRegistry(type(BaseModel)):
|
|
|
20
20
|
base_names.update(b.__name__ for b in base.__bases__)
|
|
21
21
|
base_classes.extend(base.__bases__)
|
|
22
22
|
|
|
23
|
-
if "
|
|
23
|
+
if "Operand" in base_names:
|
|
24
24
|
# Track parametrized operands separately
|
|
25
|
-
if "
|
|
25
|
+
if "ParametrizedOperand" in base_names:
|
|
26
26
|
cls._parametrized_registry.append(new_class)
|
|
27
27
|
else:
|
|
28
28
|
try:
|
|
@@ -33,7 +33,7 @@ class OperatorRegistry(type(BaseModel)):
|
|
|
33
33
|
return new_class
|
|
34
34
|
|
|
35
35
|
@classmethod
|
|
36
|
-
def get_operand(cls, name: str) -> Optional["
|
|
36
|
+
def get_operand(cls, name: str) -> Optional["Operand"]:
|
|
37
37
|
# First try to resolve as a parametrized operand formula
|
|
38
38
|
for operand_cls in cls._parametrized_registry:
|
|
39
39
|
resolved = operand_cls.from_formula(name)
|
|
@@ -46,7 +46,7 @@ class OperatorRegistry(type(BaseModel)):
|
|
|
46
46
|
return None
|
|
47
47
|
|
|
48
48
|
|
|
49
|
-
class
|
|
49
|
+
class Operand(BaseModel, metaclass=OperandRegistry):
|
|
50
50
|
name: Optional[str] = None
|
|
51
51
|
alias: Optional[str] = None
|
|
52
52
|
is_unary: bool = False
|
|
@@ -75,7 +75,7 @@ class Operator(BaseModel, metaclass=OperatorRegistry):
|
|
|
75
75
|
return self.name
|
|
76
76
|
|
|
77
77
|
|
|
78
|
-
class
|
|
78
|
+
class ParametrizedOperand(Operand, abc.ABC):
|
|
79
79
|
|
|
80
80
|
@abc.abstractmethod
|
|
81
81
|
def to_formula(self) -> str:
|
|
@@ -83,14 +83,14 @@ class ParametrizedOperator(Operator, abc.ABC):
|
|
|
83
83
|
|
|
84
84
|
@classmethod
|
|
85
85
|
@abc.abstractmethod
|
|
86
|
-
def from_formula(cls, formula: str) -> Optional["
|
|
86
|
+
def from_formula(cls, formula: str) -> Optional["Operand"]:
|
|
87
87
|
pass
|
|
88
88
|
|
|
89
89
|
|
|
90
90
|
MAIN_COLUMN = "main_column"
|
|
91
91
|
|
|
92
92
|
|
|
93
|
-
class
|
|
93
|
+
class PandasOperand(Operand, abc.ABC):
|
|
94
94
|
def calculate(self, **kwargs) -> pd.Series:
|
|
95
95
|
if self.is_unary:
|
|
96
96
|
return self.calculate_unary(kwargs["data"])
|
|
@@ -131,7 +131,7 @@ class PandasOperator(Operator, abc.ABC):
|
|
|
131
131
|
return value
|
|
132
132
|
|
|
133
133
|
|
|
134
|
-
class VectorizableMixin(
|
|
134
|
+
class VectorizableMixin(Operand):
|
|
135
135
|
group_index: int = 1
|
|
136
136
|
|
|
137
137
|
def validate_calculation(self, input_columns: List[str], **kwargs) -> Tuple[str, List[str]]:
|
|
@@ -2,10 +2,10 @@ from typing import Dict, Optional
|
|
|
2
2
|
import numpy as np
|
|
3
3
|
import pandas as pd
|
|
4
4
|
|
|
5
|
-
from upgini.autofe.
|
|
5
|
+
from upgini.autofe.operand import PandasOperand, VectorizableMixin
|
|
6
6
|
|
|
7
7
|
|
|
8
|
-
class Abs(
|
|
8
|
+
class Abs(PandasOperand, VectorizableMixin):
|
|
9
9
|
name: str = "abs"
|
|
10
10
|
is_unary: bool = True
|
|
11
11
|
is_vectorizable: bool = True
|
|
@@ -20,7 +20,7 @@ class Abs(PandasOperator, VectorizableMixin):
|
|
|
20
20
|
# return data.abs()
|
|
21
21
|
|
|
22
22
|
|
|
23
|
-
class Log(
|
|
23
|
+
class Log(PandasOperand, VectorizableMixin):
|
|
24
24
|
name: str = "log"
|
|
25
25
|
is_unary: bool = True
|
|
26
26
|
is_vectorizable: bool = True
|
|
@@ -34,7 +34,7 @@ class Log(PandasOperator, VectorizableMixin):
|
|
|
34
34
|
return self._round_value(np.log(data.replace(0, np.nan).abs()), 10)
|
|
35
35
|
|
|
36
36
|
|
|
37
|
-
class Sqrt(
|
|
37
|
+
class Sqrt(PandasOperand, VectorizableMixin):
|
|
38
38
|
name: str = "sqrt"
|
|
39
39
|
is_unary: bool = True
|
|
40
40
|
is_vectorizable: bool = True
|
|
@@ -48,7 +48,7 @@ class Sqrt(PandasOperator, VectorizableMixin):
|
|
|
48
48
|
return self._round_value(np.sqrt(data.abs()))
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
class Square(
|
|
51
|
+
class Square(PandasOperand, VectorizableMixin):
|
|
52
52
|
name: str = "square"
|
|
53
53
|
is_unary: bool = True
|
|
54
54
|
is_vectorizable: bool = True
|
|
@@ -61,7 +61,7 @@ class Square(PandasOperator, VectorizableMixin):
|
|
|
61
61
|
return np.square(data)
|
|
62
62
|
|
|
63
63
|
|
|
64
|
-
class Sigmoid(
|
|
64
|
+
class Sigmoid(PandasOperand, VectorizableMixin):
|
|
65
65
|
name: str = "sigmoid"
|
|
66
66
|
is_unary: bool = True
|
|
67
67
|
is_vectorizable: bool = True
|
|
@@ -75,7 +75,7 @@ class Sigmoid(PandasOperator, VectorizableMixin):
|
|
|
75
75
|
return self._round_value(1 / (1 + np.exp(-data)))
|
|
76
76
|
|
|
77
77
|
|
|
78
|
-
class Floor(
|
|
78
|
+
class Floor(PandasOperand, VectorizableMixin):
|
|
79
79
|
name: str = "floor"
|
|
80
80
|
is_unary: bool = True
|
|
81
81
|
is_vectorizable: bool = True
|
|
@@ -90,7 +90,7 @@ class Floor(PandasOperator, VectorizableMixin):
|
|
|
90
90
|
return np.floor(data)
|
|
91
91
|
|
|
92
92
|
|
|
93
|
-
class Residual(
|
|
93
|
+
class Residual(PandasOperand, VectorizableMixin):
|
|
94
94
|
name: str = "residual"
|
|
95
95
|
is_unary: bool = True
|
|
96
96
|
is_vectorizable: bool = True
|
|
@@ -104,7 +104,7 @@ class Residual(PandasOperator, VectorizableMixin):
|
|
|
104
104
|
return data - np.floor(data)
|
|
105
105
|
|
|
106
106
|
|
|
107
|
-
class Freq(
|
|
107
|
+
class Freq(PandasOperand):
|
|
108
108
|
name: str = "freq"
|
|
109
109
|
is_unary: bool = True
|
|
110
110
|
output_type: Optional[str] = "float"
|
|
@@ -116,7 +116,7 @@ class Freq(PandasOperator):
|
|
|
116
116
|
return self._loc(data, value_counts)
|
|
117
117
|
|
|
118
118
|
|
|
119
|
-
class Norm(
|
|
119
|
+
class Norm(PandasOperand):
|
|
120
120
|
name: str = "norm"
|
|
121
121
|
is_unary: bool = True
|
|
122
122
|
output_type: Optional[str] = "float"
|
|
@@ -148,7 +148,7 @@ class Norm(PandasOperator):
|
|
|
148
148
|
return res
|
|
149
149
|
|
|
150
150
|
|
|
151
|
-
class Embeddings(
|
|
151
|
+
class Embeddings(PandasOperand):
|
|
152
152
|
name: str = "emb"
|
|
153
153
|
is_unary: bool = True
|
|
154
154
|
input_type: Optional[str] = "string"
|
upgini-1.2.62a3818.dev3/src/upgini/autofe/timeseries.py → upgini-1.2.63/src/upgini/autofe/vector.py
RENAMED
|
@@ -2,15 +2,35 @@ import abc
|
|
|
2
2
|
from typing import Dict, List, Optional
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
|
-
from upgini.autofe.operator import PandasOperator, ParametrizedOperator
|
|
6
5
|
|
|
7
6
|
try:
|
|
8
7
|
from pydantic import field_validator as validator # V2
|
|
9
8
|
except ImportError:
|
|
10
9
|
from pydantic import validator # V1
|
|
11
10
|
|
|
11
|
+
from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
|
|
14
|
+
class Mean(PandasOperand, VectorizableMixin):
|
|
15
|
+
name: str = "mean"
|
|
16
|
+
output_type: Optional[str] = "float"
|
|
17
|
+
is_vector: bool = True
|
|
18
|
+
group_index: int = 0
|
|
19
|
+
|
|
20
|
+
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
21
|
+
return pd.DataFrame(data).T.fillna(0).mean(axis=1)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class Sum(PandasOperand, VectorizableMixin):
|
|
25
|
+
name: str = "sum"
|
|
26
|
+
is_vector: bool = True
|
|
27
|
+
group_index: int = 0
|
|
28
|
+
|
|
29
|
+
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
30
|
+
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class TimeSeriesBase(PandasOperand, abc.ABC):
|
|
14
34
|
is_vector: bool = True
|
|
15
35
|
date_unit: Optional[str] = None
|
|
16
36
|
offset_size: int = 0
|
|
@@ -35,7 +55,7 @@ class TimeSeriesBase(PandasOperator, abc.ABC):
|
|
|
35
55
|
ts.set_index(date.name, inplace=True)
|
|
36
56
|
ts = ts[ts.index.notna()].sort_index()
|
|
37
57
|
ts = (
|
|
38
|
-
ts.groupby([c.name for c in data[1:-1]]
|
|
58
|
+
ts.groupby([c.name for c in data[1:-1]])
|
|
39
59
|
.apply(self._shift)[data[-1].name]
|
|
40
60
|
.to_frame()
|
|
41
61
|
.reset_index()
|
|
@@ -64,10 +84,10 @@ class TimeSeriesBase(PandasOperator, abc.ABC):
|
|
|
64
84
|
pass
|
|
65
85
|
|
|
66
86
|
|
|
67
|
-
_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()
|
|
87
|
+
_roll_aggregations = {"norm_mean": lambda x: x[-1] / x.mean()}
|
|
68
88
|
|
|
69
89
|
|
|
70
|
-
class Roll(TimeSeriesBase,
|
|
90
|
+
class Roll(TimeSeriesBase, ParametrizedOperand):
|
|
71
91
|
aggregation: str
|
|
72
92
|
window_size: int = 1
|
|
73
93
|
window_unit: str = "D"
|
|
@@ -142,7 +162,7 @@ class Roll(TimeSeriesBase, ParametrizedOperator):
|
|
|
142
162
|
)
|
|
143
163
|
|
|
144
164
|
|
|
145
|
-
class Lag(TimeSeriesBase,
|
|
165
|
+
class Lag(TimeSeriesBase, ParametrizedOperand):
|
|
146
166
|
lag_size: int
|
|
147
167
|
lag_unit: str = "D"
|
|
148
168
|
|
|
@@ -31,7 +31,7 @@ from sklearn.exceptions import NotFittedError
|
|
|
31
31
|
from sklearn.model_selection import BaseCrossValidator
|
|
32
32
|
|
|
33
33
|
from upgini.autofe.feature import Feature
|
|
34
|
-
from upgini.autofe.
|
|
34
|
+
from upgini.autofe.vector import TimeSeriesBase
|
|
35
35
|
from upgini.data_source.data_source_publisher import CommercialSchema
|
|
36
36
|
from upgini.dataset import Dataset
|
|
37
37
|
from upgini.errors import HttpError, ValidationError
|
|
@@ -3,7 +3,6 @@ from collections import namedtuple
|
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import numpy.ma as ma
|
|
6
|
-
import scipy
|
|
7
6
|
from joblib import Parallel, delayed
|
|
8
7
|
from numpy import ndarray
|
|
9
8
|
from psutil import cpu_count
|
|
@@ -116,6 +115,22 @@ def spearmanr(
|
|
|
116
115
|
if nan_policy == "omit":
|
|
117
116
|
x = mask_fn(x)
|
|
118
117
|
|
|
118
|
+
# - dof: degrees of freedom
|
|
119
|
+
# - t_stat: t-statistic
|
|
120
|
+
# - alternative: 'two-sided', 'greater', 'less'
|
|
121
|
+
def compute_t_pvalue(t_stat, dof, alternative='two-sided'):
|
|
122
|
+
from scipy.stats import t
|
|
123
|
+
|
|
124
|
+
if alternative == "two-sided":
|
|
125
|
+
prob = 2 * t.sf(abs(t_stat), dof)
|
|
126
|
+
elif alternative == "greater":
|
|
127
|
+
prob = t.sf(t_stat, dof)
|
|
128
|
+
elif alternative == "less":
|
|
129
|
+
prob = t.cdf(t_stat, dof)
|
|
130
|
+
else:
|
|
131
|
+
raise ValueError(f"Unknown alternative: {alternative}")
|
|
132
|
+
return t_stat, prob
|
|
133
|
+
|
|
119
134
|
def _spearmanr_2cols(x):
|
|
120
135
|
# Mask the same observations for all variables, and then drop those
|
|
121
136
|
# observations (can't leave them masked, rankdata is weird).
|
|
@@ -142,7 +157,7 @@ def spearmanr(
|
|
|
142
157
|
# errors before taking the square root
|
|
143
158
|
t = rs * np.sqrt((dof / ((rs + 1.0) * (1.0 - rs))).clip(0))
|
|
144
159
|
|
|
145
|
-
t, prob =
|
|
160
|
+
# compute_t_pvalue is declared as (t_stat, dof, alternative): pass the t-statistic first, then the degrees of freedom.
t, prob = compute_t_pvalue(t, dof, alternative)
|
|
146
161
|
|
|
147
162
|
# For backwards compatibility, return scalars when comparing 2 columns
|
|
148
163
|
if rs.shape == (2, 2):
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.62a3818.dev3"
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
from typing import List, Optional
|
|
2
|
-
|
|
3
|
-
import pandas as pd
|
|
4
|
-
|
|
5
|
-
from upgini.autofe.operator import PandasOperator, VectorizableMixin
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class Mean(PandasOperator, VectorizableMixin):
|
|
9
|
-
name: str = "mean"
|
|
10
|
-
output_type: Optional[str] = "float"
|
|
11
|
-
is_vector: bool = True
|
|
12
|
-
group_index: int = 0
|
|
13
|
-
|
|
14
|
-
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
15
|
-
return pd.DataFrame(data).T.fillna(0).mean(axis=1)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class Sum(PandasOperator, VectorizableMixin):
|
|
19
|
-
name: str = "sum"
|
|
20
|
-
is_vector: bool = True
|
|
21
|
-
group_index: int = 0
|
|
22
|
-
|
|
23
|
-
def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
|
|
24
|
-
return pd.DataFrame(data).T.fillna(0).sum(axis=1)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{upgini-1.2.62a3818.dev3 → upgini-1.2.63}/src/upgini/resource_bundle/strings_widget.properties
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|