mloda 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/METADATA +10 -10
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/RECORD +92 -91
- mloda_core/abstract_plugins/components/base_artifact.py +3 -1
- mloda_core/abstract_plugins/components/feature.py +4 -4
- mloda_core/abstract_plugins/components/feature_chainer/feature_chain_parser.py +44 -17
- mloda_core/abstract_plugins/components/feature_collection.py +2 -2
- mloda_core/abstract_plugins/components/feature_group_version.py +4 -4
- mloda_core/abstract_plugins/components/feature_name.py +0 -3
- mloda_core/abstract_plugins/components/input_data/base_input_data.py +3 -3
- mloda_core/abstract_plugins/components/link.py +113 -29
- mloda_core/abstract_plugins/components/options.py +10 -10
- mloda_core/api/prepare/setup_compute_framework.py +2 -2
- mloda_core/api/request.py +44 -13
- mloda_core/core/step/feature_group_step.py +2 -1
- mloda_core/filter/filter_engine.py +3 -12
- mloda_core/filter/filter_parameter.py +55 -0
- mloda_core/filter/single_filter.py +4 -4
- mloda_core/prepare/execution_plan.py +12 -6
- mloda_core/prepare/graph/graph.py +3 -3
- mloda_core/prepare/identify_feature_group.py +10 -3
- mloda_core/prepare/resolve_links.py +86 -18
- mloda_core/runtime/flight/flight_server.py +1 -1
- mloda_core/runtime/run.py +7 -5
- mloda_core/runtime/worker/multiprocessing_worker.py +11 -9
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +7 -33
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +22 -12
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +2 -2
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +3 -3
- mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +5 -5
- mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +8 -34
- mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_merge_engine.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +3 -3
- mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +7 -33
- mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +1 -1
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +13 -32
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +1 -1
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +13 -32
- mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +4 -4
- mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
- mloda_plugins/config/feature/loader.py +12 -18
- mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +20 -17
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +8 -8
- mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +8 -8
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +7 -7
- mloda_plugins/feature_group/experimental/clustering/base.py +26 -26
- mloda_plugins/feature_group/experimental/clustering/pandas.py +31 -29
- mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +23 -22
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +16 -16
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +9 -11
- mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +8 -8
- mloda_plugins/feature_group/experimental/default_options_key.py +1 -1
- mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +17 -15
- mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +30 -18
- mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +35 -35
- mloda_plugins/feature_group/experimental/forecasting/base.py +39 -29
- mloda_plugins/feature_group/experimental/forecasting/pandas.py +18 -18
- mloda_plugins/feature_group/experimental/geo_distance/base.py +18 -20
- mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +6 -6
- mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +2 -2
- mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +2 -2
- mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +2 -2
- mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +3 -2
- mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +1 -1
- mloda_plugins/feature_group/experimental/node_centrality/base.py +8 -12
- mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +11 -12
- mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +9 -14
- mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +8 -9
- mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/source_input_feature.py +10 -10
- mloda_plugins/feature_group/experimental/text_cleaning/base.py +8 -11
- mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/time_window/base.py +27 -25
- mloda_plugins/feature_group/experimental/time_window/pandas.py +8 -8
- mloda_plugins/feature_group/experimental/time_window/pyarrow.py +6 -6
- mloda_plugins/feature_group/input_data/read_context_files.py +1 -1
- mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +1 -1
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/WHEEL +0 -0
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/entry_points.txt +0 -0
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/licenses/LICENSE.TXT +0 -0
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/licenses/NOTICE.md +0 -0
- {mloda-0.3.0.dist-info → mloda-0.3.2.dist-info}/top_level.txt +0 -0
mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py
CHANGED
@@ -5,7 +5,7 @@ from mloda_core.abstract_plugins.components.framework_transformer.base_transform
 try:
     import polars as pl
 except ImportError:
-    pl = None  # type: ignore
+    pl = None  # type: ignore[assignment]
 
 try:
     import pyarrow as pa
@@ -13,7 +13,7 @@ except ImportError:
     pa = None
 
 
-class PolarsLazyPyarrowTransformer(BaseTransformer):
+class PolarsLazyPyArrowTransformer(BaseTransformer):
     """
     Transformer for converting between Polars LazyFrame and PyArrow Table.
 
@@ -66,4 +66,4 @@ class PolarsLazyPyarrowTransformer(BaseTransformer):
             raise ImportError("Polars is not installed. To be able to use this framework, please install polars.")
         # Convert PyArrow to DataFrame, then make it lazy
         df = pl.from_arrow(data)
-        return df.lazy()  # type: ignore
+        return df.lazy()  # type: ignore[union-attr]
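Note on the type-ignore changes above (the PythonDict and Spark frameworks later in this diff receive the same treatment): narrowing a blanket "# type: ignore" to a specific error code such as [assignment] or [union-attr] keeps mypy reporting every other category of error on that line. A minimal sketch of the optional-import idiom, assuming only that polars may be missing:

try:
    import polars as pl
except ImportError:
    # [assignment] exempts only this None-to-module rebinding; any other
    # mypy error on the same line would still be reported.
    pl = None  # type: ignore[assignment]

if pl is not None:
    # Guarding on pl avoids union-attr errors at call sites.
    frame = pl.DataFrame({"a": [1, 2, 3]})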
mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py
CHANGED
@@ -5,7 +5,7 @@ from mloda_core.abstract_plugins.components.framework_transformer.base_transform
 try:
     import polars as pl
 except ImportError:
-    pl = None  # type: ignore
+    pl = None  # type: ignore[assignment]
 
 try:
     import pyarrow as pa
@@ -13,7 +13,7 @@ except ImportError:
     pa = None
 
 
-class PolarsPyarrowTransformer(BaseTransformer):
+class PolarsPyArrowTransformer(BaseTransformer):
     """
     Transformer for converting between Polars DataFrame and PyArrow Table.
 
mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py
CHANGED
@@ -40,11 +40,7 @@ class PyArrowFilterEngine(BaseFilterEngine):
         column_name = str(filter_feature.name)
 
         # Extract the value from the parameter
-        value = None
-        for param in filter_feature.parameter:
-            if param[0] == "value":
-                value = param[1]
-                break
+        value = filter_feature.parameter.value
 
         if value is None:
             raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
@@ -59,14 +55,8 @@ class PyArrowFilterEngine(BaseFilterEngine):
         column_name = str(filter_feature.name)
 
         # Check if this is a complex parameter with max/max_exclusive or a simple one with value
-        has_max = False
-        has_value = False
-
-        for param in filter_feature.parameter:
-            if param[0] == "max":
-                has_max = True
-            elif param[0] == "value":
-                has_value = True
+        has_max = filter_feature.parameter.max_value is not None
+        has_value = filter_feature.parameter.value is not None
 
         if has_max:
             # Complex parameter - use get_min_max_operator
@@ -90,11 +80,7 @@ class PyArrowFilterEngine(BaseFilterEngine):
             return data.filter(mask)
         elif has_value:
             # Simple parameter - extract the value
-            value = None
-            for param in filter_feature.parameter:
-                if param[0] == "value":
-                    value = param[1]
-                    break
+            value = filter_feature.parameter.value
 
             if value is None:
                 raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
@@ -111,11 +97,7 @@ class PyArrowFilterEngine(BaseFilterEngine):
         column_name = str(filter_feature.name)
 
         # Extract the value from the parameter
-        value = None
-        for param in filter_feature.parameter:
-            if param[0] == "value":
-                value = param[1]
-                break
+        value = filter_feature.parameter.value
 
         if value is None:
             raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
@@ -130,11 +112,7 @@ class PyArrowFilterEngine(BaseFilterEngine):
         column_name = str(filter_feature.name)
 
         # Extract the value from the parameter
-        value = None
-        for param in filter_feature.parameter:
-            if param[0] == "value":
-                value = param[1]
-                break
+        value = filter_feature.parameter.value
 
         if value is None:
             raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
@@ -151,11 +129,7 @@ class PyArrowFilterEngine(BaseFilterEngine):
         column_name = str(filter_feature.name)
 
         # Extract the values from the parameter
-        values = None
-        for param in filter_feature.parameter:
-            if param[0] == "values":
-                values = param[1]
-                break
+        values = filter_feature.parameter.values
 
         if values is None:
             raise ValueError(f"Filter parameter 'values' not found in {filter_feature.parameter}")
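The hunks above show the refactor that repeats across every filter engine in this release: 0.3.0 scanned filter_feature.parameter as an iterable of (key, value) tuples, while 0.3.2 reads attributes from a parameter object (presumably the new mloda_core/filter/filter_parameter.py listed above, +55 -0). A hedged sketch of the two access styles; FilterParameterSketch is a stand-in inferred from the call sites, not the actual mloda class:

from dataclasses import dataclass
from typing import Any, Optional, Tuple

@dataclass
class FilterParameterSketch:
    # Field names mirror the attribute accesses in the diffs:
    # .value, .values, .max_value. The real class likely carries more.
    value: Optional[Any] = None
    values: Optional[Tuple[Any, ...]] = None
    max_value: Optional[Any] = None

def extract_value_old(parameter: Tuple[Tuple[str, Any], ...]) -> Optional[Any]:
    # 0.3.0 style: linear scan over (key, value) tuples.
    for param in parameter:
        if param[0] == "value":
            return param[1]
    return None

def extract_value_new(parameter: FilterParameterSketch) -> Optional[Any]:
    # 0.3.2 style: direct attribute access, no loop.
    return parameter.value

assert extract_value_old((("value", 42),)) == 42
assert extract_value_new(FilterParameterSketch(value=42)) == 42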
mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py
CHANGED
@@ -45,11 +45,8 @@ class PythonDictFilterEngine(BaseFilterEngine):
         column_name = filter_feature.name
 
         # Extract the value from the parameter
-        value = None
-        for param in filter_feature.parameter:
-            if param[0] == "value":
-                value = param[1]
-                break
+
+        value = filter_feature.parameter.value
 
         if value is None:
             raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
@@ -61,14 +58,10 @@ class PythonDictFilterEngine(BaseFilterEngine):
         column_name = filter_feature.name
 
         # Check if this is a complex parameter with max/max_exclusive or a simple one with value
-        has_max = False
-        has_value = False
 
-        for param in filter_feature.parameter:
-            if param[0] == "max":
-                has_max = True
-            elif param[0] == "value":
-                has_value = True
+        has_max = filter_feature.parameter.max_value is not None
+
+        has_value = filter_feature.parameter.value is not None
 
         if has_max:
             # Complex parameter - use get_min_max_operator
@@ -94,11 +87,8 @@ class PythonDictFilterEngine(BaseFilterEngine):
             ]
         elif has_value:
             # Simple parameter - extract the value
-            value = None
-            for param in filter_feature.parameter:
-                if param[0] == "value":
-                    value = param[1]
-                    break
+
+            value = filter_feature.parameter.value
 
             if value is None:
                 raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
@@ -112,11 +102,8 @@ class PythonDictFilterEngine(BaseFilterEngine):
         column_name = filter_feature.name
 
         # Extract the value from the parameter
-        value = None
-        for param in filter_feature.parameter:
-            if param[0] == "value":
-                value = param[1]
-                break
+
+        value = filter_feature.parameter.value
 
         if value is None:
             raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
@@ -128,11 +115,8 @@ class PythonDictFilterEngine(BaseFilterEngine):
         column_name = filter_feature.name
 
         # Extract the value from the parameter
-        value = None
-        for param in filter_feature.parameter:
-            if param[0] == "value":
-                value = param[1]
-                break
+
+        value = filter_feature.parameter.value
 
         if value is None:
             raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
@@ -151,11 +135,8 @@ class PythonDictFilterEngine(BaseFilterEngine):
         column_name = filter_feature.name
 
         # Extract the values from the parameter
-        values = None
-        for param in filter_feature.parameter:
-            if param[0] == "values":
-                values = param[1]
-                break
+
+        values = filter_feature.parameter.values
 
         if values is None:
             raise ValueError(f"Filter parameter 'values' not found in {filter_feature.parameter}")
mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py
CHANGED
@@ -82,7 +82,7 @@ class PythonDictFramework(ComputeFrameWork):
 
         transformed_data = self.apply_compute_framework_transformer(data)
         if transformed_data is not None:
-            return transformed_data  # type: ignore
+            return transformed_data  # type: ignore[no-any-return]
 
         if isinstance(data, dict):
             """Initial data: Transform columnar dict to row-based list of dicts"""
mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py
CHANGED
@@ -37,11 +37,8 @@ class SparkFilterEngine(BaseFilterEngine):
         column_name = filter_feature.name.name
 
         # Extract the value from the parameter
-        value = None
-        for param in filter_feature.parameter:
-            if param[0] == "value":
-                value = param[1]
-                break
+
+        value = filter_feature.parameter.value
 
         if value is None:
             raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
@@ -53,14 +50,10 @@ class SparkFilterEngine(BaseFilterEngine):
         column_name = filter_feature.name.name
 
         # Check if this is a complex parameter with max/max_exclusive or a simple one with value
-        has_max = False
-        has_value = False
 
-        for param in filter_feature.parameter:
-            if param[0] == "max":
-                has_max = True
-            elif param[0] == "value":
-                has_value = True
+        has_max = filter_feature.parameter.max_value is not None
+
+        has_value = filter_feature.parameter.value is not None
 
         if has_max:
             # Complex parameter - use get_min_max_operator
@@ -82,11 +75,8 @@ class SparkFilterEngine(BaseFilterEngine):
             condition = F.col(column_name) <= max_parameter
         elif has_value:
             # Simple parameter - extract the value
-            value = None
-            for param in filter_feature.parameter:
-                if param[0] == "value":
-                    value = param[1]
-                    break
+
+            value = filter_feature.parameter.value
 
             if value is None:
                 raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
@@ -102,11 +92,8 @@ class SparkFilterEngine(BaseFilterEngine):
         column_name = filter_feature.name.name
 
         # Extract the value from the parameter
-        value = None
-        for param in filter_feature.parameter:
-            if param[0] == "value":
-                value = param[1]
-                break
+
+        value = filter_feature.parameter.value
 
         if value is None:
             raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
@@ -118,11 +105,8 @@ class SparkFilterEngine(BaseFilterEngine):
         column_name = filter_feature.name.name
 
         # Extract the value from the parameter
-        value = None
-        for param in filter_feature.parameter:
-            if param[0] == "value":
-                value = param[1]
-                break
+
+        value = filter_feature.parameter.value
 
         if value is None:
             raise ValueError(f"Filter parameter 'value' not found in {filter_feature.parameter}")
@@ -135,11 +119,8 @@ class SparkFilterEngine(BaseFilterEngine):
         column_name = filter_feature.name.name
 
         # Extract the values from the parameter
-        values = None
-        for param in filter_feature.parameter:
-            if param[0] == "values":
-                values = param[1]
-                break
+
+        values = filter_feature.parameter.values
 
         if values is None:
             raise ValueError(f"Filter parameter 'values' not found in {filter_feature.parameter}")
mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py
CHANGED
@@ -119,14 +119,14 @@ class SparkFramework(ComputeFrameWork):
 
         # Handle empty dict
         if not data:
-            return spark.createDataFrame([], StructType([]))  # type: ignore
+            return spark.createDataFrame([], StructType([]))  # type: ignore[union-attr]
 
         # Infer schema from the first row of data
         first_key = next(iter(data.keys()))
         if not data[first_key]:  # Empty list
             schema_fields = [StructField(col, StringType(), True) for col in data.keys()]
             schema = StructType(schema_fields)
-            return spark.createDataFrame([], schema)  # type: ignore
+            return spark.createDataFrame([], schema)  # type: ignore[union-attr]
 
         # Create schema based on first values
         schema_fields = []
@@ -146,9 +146,9 @@ class SparkFramework(ComputeFrameWork):
             for i in range(num_rows):
                 row = tuple(data[col][i] for col in data.keys())
                 rows.append(row)
-            return spark.createDataFrame(rows, schema)  # type: ignore
+            return spark.createDataFrame(rows, schema)  # type: ignore[union-attr]
         else:
-            return spark.createDataFrame([], schema)  # type: ignore
+            return spark.createDataFrame([], schema)  # type: ignore[union-attr]
 
         if hasattr(data, "__iter__") and not isinstance(data, (str, bytes, DataFrame)):
             """Added data: Add column to DataFrame"""
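For context on the SparkFramework hunks above: the surrounding method infers a schema from the first value of each column and re-shapes the columnar dict into row tuples before calling createDataFrame. A standalone sketch of that re-shaping, with illustrative column names and a local SparkSession assumed:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
data = {"id": [1, 2], "name": ["a", "b"]}
num_rows = len(next(iter(data.values())))
# One tuple per row, columns in dict order, as in the loop shown above.
rows = [tuple(data[col][i] for col in data.keys()) for i in range(num_rows)]
df = spark.createDataFrame(rows, list(data.keys()))
df.show()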
mloda_plugins/config/feature/loader.py
CHANGED
@@ -14,7 +14,7 @@ from mloda_plugins.feature_group.experimental.default_options_key import Default
 
 
 def process_nested_features(options: Dict[str, Any]) -> Dict[str, Any]:
-    """Recursively convert nested mloda_source_features dicts to Feature objects.
+    """Recursively convert nested in_features dicts to Feature objects.
 
     Args:
         options: Dictionary of options that may contain nested feature definitions
@@ -24,11 +24,11 @@ def process_nested_features(options: Dict[str, Any]) -> Dict[str, Any]:
     """
     processed: Dict[str, Any] = {}
    for key, value in options.items():
-        if key == "mloda_source_features" and isinstance(value, dict):
+        if key == "in_features" and isinstance(value, dict):
            # This is a nested feature definition - convert it to a Feature object
            feature_name = value.get("name")
            if not feature_name:
-                raise ValueError(f"Nested mloda_source_features must have a 'name' field: {value}")
+                raise ValueError(f"Nested in_features must have a 'name' field: {value}")
 
            # Recursively process nested options
            nested_options = value.get("options", {})
@@ -39,17 +39,15 @@ def process_nested_features(options: Dict[str, Any]) -> Dict[str, Any]:
            if mloda_sources:
                if isinstance(mloda_sources, list):
                    # For list, convert each to string (single sources) or keep as-is
-                    processed_nested_options["mloda_source_features"] = (
+                    processed_nested_options["in_features"] = (
                        mloda_sources if len(mloda_sources) > 1 else mloda_sources[0]
                    )
                elif isinstance(mloda_sources, dict):
                    # Recursively create Feature for mloda_sources
-                    mloda_source_features = process_nested_features({"mloda_source_features": mloda_sources})[
-                        "mloda_source_features"
-                    ]
-                    processed_nested_options["mloda_source_features"] = mloda_source_features
+                    in_features = process_nested_features({"in_features": mloda_sources})["in_features"]
+                    processed_nested_options["in_features"] = in_features
                else:
-                    processed_nested_options["mloda_source_features"] = mloda_sources
+                    processed_nested_options["in_features"] = mloda_sources
 
            # Create the Feature object
            processed[key] = Feature(name=feature_name, options=processed_nested_options)
@@ -104,7 +102,7 @@ def load_features_from_config(config_str: str, format: str = "json") -> List[Uni
        # Handle mloda_sources if present
        if item.mloda_sources:
            # Always convert to frozenset for consistency
-            context[DefaultOptionKeys.mloda_source_features] = frozenset(item.mloda_sources)
+            context[DefaultOptionKeys.in_features] = frozenset(item.mloda_sources)
        options = Options(group=item.group_options or {}, context=context)
        feature = Feature(name=feature_name, options=options)
        features.append(feature)
@@ -115,9 +113,7 @@ def load_features_from_config(config_str: str, format: str = "json") -> List[Uni
        processed_options = process_nested_features(item.options)
        # Always convert to frozenset for consistency (even single items)
        source_value = frozenset(item.mloda_sources)
-        options = Options(
-            group=processed_options, context={DefaultOptionKeys.mloda_source_features: source_value}
-        )
+        options = Options(group=processed_options, context={DefaultOptionKeys.in_features: source_value})
        feature = Feature(name=feature_name, options=options)
        features.append(feature)
        feature_registry[feature_name] = feature
@@ -133,16 +129,14 @@ def load_features_from_config(config_str: str, format: str = "json") -> List[Uni
    # Pass 2: Resolve @feature_name references to Feature objects
    for feat in features:
        if isinstance(feat, Feature):
-            mloda_source = feat.options.context.get(DefaultOptionKeys.mloda_source_features)
+            mloda_source = feat.options.context.get(DefaultOptionKeys.in_features)
            if mloda_source:
                # Handle both single string and frozenset of strings
                if isinstance(mloda_source, str) and mloda_source.startswith("@"):
                    # Single reference string
                    referenced_name = mloda_source[1:]
                    if referenced_name in feature_registry:
-                        feat.options.context[DefaultOptionKeys.mloda_source_features] = feature_registry[
-                            referenced_name
-                        ]
+                        feat.options.context[DefaultOptionKeys.in_features] = feature_registry[referenced_name]
                    else:
                        raise ValueError(f"Feature reference '@{referenced_name}' not found in configuration")
                elif isinstance(mloda_source, frozenset):
@@ -159,6 +153,6 @@ def load_features_from_config(config_str: str, format: str = "json") -> List[Uni
                        resolved_sources.append(source)
                # Only replace if we actually resolved any references
                if any(isinstance(s, str) and s.startswith("@") for s in mloda_source):
-                    feat.options.context[DefaultOptionKeys.mloda_source_features] = frozenset(resolved_sources)
+                    feat.options.context[DefaultOptionKeys.in_features] = frozenset(resolved_sources)
 
    return features
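The loader rename is user-visible: configuration-driven features now carry their sources under DefaultOptionKeys.in_features instead of mloda_source_features, and "@name" references are swapped for the registered Feature objects in a second pass. A hedged usage sketch; the function name, the "@" syntax, and the in_features key come from the diff above, while the top-level JSON shape is an assumption:

from mloda_plugins.config.feature.loader import load_features_from_config

# Illustrative config: "total_sales" points at the "sales" feature by reference.
config = '''
[
  {"name": "sales"},
  {"name": "total_sales", "mloda_sources": ["@sales"]}
]
'''
features = load_features_from_config(config, format="json")
# Pass 2 of the loader replaces the "@sales" entry in the second feature's
# options.context[DefaultOptionKeys.in_features] with the Feature object
# registered under "sales".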
mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py
CHANGED
@@ -4,6 +4,7 @@ Base implementation for aggregated feature groups.
 
 from __future__ import annotations
 
+from abc import abstractmethod
 from typing import Any, List, Optional, Set, Union
 
 from mloda_core.abstract_plugins.abstract_feature_group import AbstractFeatureGroup
@@ -40,7 +41,7 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
 
     ### 1. String-Based Creation
 
-    Features follow the naming pattern: `{mloda_source_features}__{aggregation_type}_aggr`
+    Features follow the naming pattern: `{in_features}__{aggregation_type}_aggr`
 
     Examples:
     ```python
@@ -62,7 +63,7 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
         options=Options(
             context={
                 AggregatedFeatureGroup.AGGREGATION_TYPE: "sum",
-                DefaultOptionKeys.mloda_source_features: "sales",
+                DefaultOptionKeys.in_features: "sales",
             }
         )
     )
@@ -73,7 +74,7 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
     ### Context Parameters (Default)
     These parameters don't affect Feature Group resolution/splitting:
     - `aggregation_type`: The type of aggregation to perform
-    - `mloda_source_features`: The source feature to aggregate
+    - `in_features`: The source feature to aggregate
 
     ### Group Parameters
     Currently none for AggregatedFeatureGroup. Parameters that affect Feature Group
@@ -96,7 +97,6 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
         "median": "Median value",
     }
 
-    PATTERN = "__"
     PREFIX_PATTERN = r".*__([\w]+)_aggr$"
 
     # Property mapping for configuration-based feature creation
@@ -106,7 +106,7 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
            DefaultOptionKeys.mloda_context: True,  # Mark as context parameter
            DefaultOptionKeys.mloda_strict_validation: True,  # Enable strict validation
        },
-        DefaultOptionKeys.mloda_source_features: {
+        DefaultOptionKeys.in_features: {
            "explanation": "Source feature to aggregate",
            DefaultOptionKeys.mloda_context: True,  # Mark as context parameter
            DefaultOptionKeys.mloda_strict_validation: False,  # Flexible validation
@@ -119,12 +119,12 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
        source_feature: str | None = None
 
        # string based
-        _, source_feature = FeatureChainParser.parse_feature_name(feature_name, self.PATTERN, [self.PREFIX_PATTERN])
+        _, source_feature = FeatureChainParser.parse_feature_name(feature_name, [self.PREFIX_PATTERN])
        if source_feature is not None:
            return {Feature(source_feature)}
 
        # configuration based
-        source_features = options.get_source_features()
+        source_features = options.get_in_features()
        if len(source_features) != 1:
            raise ValueError(
                f"Expected exactly one source feature, but found {len(source_features)}: {source_features}"
@@ -134,7 +134,7 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
    @classmethod
    def get_aggregation_type(cls, feature_name: str) -> str:
        """Extract the aggregation type from the feature name."""
-        prefix_part, _ = FeatureChainParser.parse_feature_name(feature_name, cls.PATTERN, [cls.PREFIX_PATTERN])
+        prefix_part, _ = FeatureChainParser.parse_feature_name(feature_name, [cls.PREFIX_PATTERN])
        if prefix_part is None:
            raise ValueError(f"Could not extract aggregation type from feature name: {feature_name}")
        return prefix_part
@@ -153,7 +153,6 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
            feature_name,
            options,
            property_mapping=cls.PROPERTY_MAPPING,
-            pattern=cls.PATTERN,
            prefix_patterns=[cls.PREFIX_PATTERN],
        )
 
@@ -178,13 +177,13 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
 
        # string based
        aggregation_type, source_feature_name = FeatureChainParser.parse_feature_name(
-            feature.name, cls.PATTERN, [cls.PREFIX_PATTERN]
+            feature.name, [cls.PREFIX_PATTERN]
        )
        if aggregation_type is not None and source_feature_name is not None:
            return aggregation_type, source_feature_name
 
        # configuration based
-        source_features = feature.options.get_source_features()
+        source_features = feature.options.get_in_features()
        source_feature = next(iter(source_features))
        source_feature_name = source_feature.get_name()
 
@@ -243,6 +242,7 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
        return data
 
    @classmethod
+    @abstractmethod
    def _get_available_columns(cls, data: Any) -> Set[str]:
        """
        Get the set of available column names from the data.
@@ -253,9 +253,10 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
        Returns:
            Set of column names available in the data
        """
-        raise NotImplementedError
+        ...
 
    @classmethod
+    @abstractmethod
    def _check_source_features_exist(cls, data: Any, feature_names: List[str]) -> None:
        """
        Check if the resolved source features exist in the data.
@@ -267,9 +268,10 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
        Raises:
            ValueError: If none of the features exist in the data
        """
-        raise NotImplementedError
+        ...
 
    @classmethod
+    @abstractmethod
    def _add_result_to_data(cls, data: Any, feature_name: str, result: Any) -> Any:
        """
        Add the result to the data.
@@ -282,10 +284,11 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
        Returns:
            The updated data
        """
-        raise NotImplementedError
+        ...
 
    @classmethod
-    def _perform_aggregation(cls, data: Any, aggregation_type: str, mloda_source_features: List[str]) -> Any:
+    @abstractmethod
+    def _perform_aggregation(cls, data: Any, aggregation_type: str, in_features: List[str]) -> Any:
        """
        Method to perform the aggregation. Should be implemented by subclasses.
 
@@ -296,9 +299,9 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
        Args:
            data: The input data
            aggregation_type: The type of aggregation to perform
-            mloda_source_features: List of resolved source feature names to aggregate
+            in_features: List of resolved source feature names to aggregate
 
        Returns:
            The result of the aggregation
        """
-        raise NotImplementedError
+        ...