mloda 0.3.3__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mloda/__init__.py +17 -0
- {mloda_core → mloda/core}/abstract_plugins/components/base_artifact.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/base_validator.py +13 -0
- {mloda_core → mloda/core}/abstract_plugins/components/data_access_collection.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/data_types.py +39 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature.py +39 -33
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/feature_chain_parser.py +19 -19
- mloda/core/abstract_plugins/components/feature_chainer/feature_chain_parser_mixin.py +197 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_collection.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/feature_group_version.py +8 -8
- {mloda_core → mloda/core}/abstract_plugins/components/feature_set.py +18 -24
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/cfw_transformer.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/index/add_index_feature.py +4 -4
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data_collection.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/base_api_data.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/base_input_data.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/data_creator.py +3 -3
- mloda/core/abstract_plugins/components/link.py +437 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/match_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/merge/base_merge_engine.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/options.py +12 -36
- {mloda_core → mloda/core}/abstract_plugins/components/parallelization_modes.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/plugin_collector.py +14 -14
- mloda/core/abstract_plugins/components/validators/datatype_validator.py +96 -0
- mloda/core/abstract_plugins/components/validators/feature_set_validator.py +38 -0
- mloda/core/abstract_plugins/components/validators/feature_validator.py +23 -0
- mloda/core/abstract_plugins/components/validators/link_validator.py +79 -0
- mloda/core/abstract_plugins/components/validators/options_validator.py +57 -0
- mloda_core/abstract_plugins/compute_frame_work.py → mloda/core/abstract_plugins/compute_framework.py +45 -37
- mloda_core/abstract_plugins/abstract_feature_group.py → mloda/core/abstract_plugins/feature_group.py +56 -33
- mloda/core/abstract_plugins/function_extender.py +78 -0
- mloda/core/api/plugin_docs.py +220 -0
- mloda/core/api/plugin_info.py +32 -0
- {mloda_core → mloda/core}/api/prepare/setup_compute_framework.py +11 -11
- {mloda_core → mloda/core}/api/request.py +42 -33
- {mloda_core → mloda/core}/core/cfw_manager.py +8 -8
- {mloda_core → mloda/core}/core/engine.py +47 -47
- {mloda_core → mloda/core}/core/step/abstract_step.py +7 -7
- {mloda_core → mloda/core}/core/step/feature_group_step.py +12 -12
- {mloda_core → mloda/core}/core/step/join_step.py +14 -14
- {mloda_core → mloda/core}/core/step/transform_frame_work_step.py +16 -16
- {mloda_core → mloda/core}/filter/filter_engine.py +1 -1
- {mloda_core → mloda/core}/filter/filter_type_enum.py +1 -1
- {mloda_core → mloda/core}/filter/global_filter.py +23 -23
- {mloda_core → mloda/core}/filter/single_filter.py +6 -6
- {mloda_core → mloda/core}/prepare/accessible_plugins.py +15 -18
- {mloda_core → mloda/core}/prepare/execution_plan.py +65 -39
- {mloda_core → mloda/core}/prepare/graph/build_graph.py +6 -6
- {mloda_core → mloda/core}/prepare/graph/graph.py +1 -1
- {mloda_core → mloda/core}/prepare/graph/properties.py +5 -5
- {mloda_core → mloda/core}/prepare/identify_feature_group.py +12 -14
- {mloda_core → mloda/core}/prepare/joinstep_collection.py +3 -3
- {mloda_core → mloda/core}/prepare/resolve_compute_frameworks.py +6 -6
- {mloda_core → mloda/core}/prepare/resolve_graph.py +11 -11
- {mloda_core → mloda/core}/prepare/resolve_links.py +11 -31
- mloda/core/prepare/validators/resolve_link_validator.py +32 -0
- mloda/core/runtime/compute_framework_executor.py +271 -0
- mloda/core/runtime/data_lifecycle_manager.py +160 -0
- mloda/core/runtime/flight/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/runner_flight_server.py +1 -1
- mloda/core/runtime/run.py +317 -0
- mloda/core/runtime/worker/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/worker/multiprocessing_worker.py +15 -10
- {mloda_core → mloda/core}/runtime/worker/thread_worker.py +2 -2
- mloda/core/runtime/worker_manager.py +96 -0
- mloda/provider/__init__.py +101 -0
- mloda/steward/__init__.py +25 -0
- mloda/user/__init__.py +57 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/METADATA +24 -31
- mloda-0.4.1.dist-info/RECORD +248 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/top_level.txt +1 -1
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_framework.py +15 -13
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +12 -10
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +18 -16
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_filter_engine.py +36 -13
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_merge_engine.py +7 -7
- mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +16 -14
- mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +13 -12
- mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +12 -11
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +11 -9
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +17 -15
- mloda_plugins/compute_framework/base_implementations/spark/spark_merge_engine.py +2 -3
- mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
- mloda_plugins/config/feature/loader.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +45 -64
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/clustering/base.py +67 -97
- mloda_plugins/feature_group/experimental/clustering/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +58 -82
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/default_options_key.py +16 -19
- mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +81 -96
- mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +24 -24
- mloda_plugins/feature_group/experimental/forecasting/base.py +108 -106
- mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py +2 -2
- mloda_plugins/feature_group/experimental/forecasting/pandas.py +15 -15
- mloda_plugins/feature_group/experimental/geo_distance/base.py +52 -44
- mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -3
- mloda_plugins/feature_group/experimental/llm/cli.py +4 -4
- mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +19 -19
- mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +8 -8
- mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/claude.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/gemini.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/openai.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +6 -6
- mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +10 -10
- mloda_plugins/feature_group/experimental/llm/tools/tool_collection.py +1 -1
- mloda_plugins/feature_group/experimental/node_centrality/base.py +46 -74
- mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +53 -53
- mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +52 -39
- mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +3 -4
- mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +44 -60
- mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py +2 -3
- mloda_plugins/feature_group/experimental/source_input_feature.py +15 -15
- mloda_plugins/feature_group/experimental/text_cleaning/base.py +38 -63
- mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/text_cleaning/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/time_window/base.py +108 -95
- mloda_plugins/feature_group/experimental/time_window/pandas.py +13 -13
- mloda_plugins/feature_group/experimental/time_window/pyarrow.py +12 -12
- mloda_plugins/feature_group/input_data/api_data/api_data.py +9 -11
- mloda_plugins/feature_group/input_data/read_context_files.py +7 -7
- mloda_plugins/feature_group/input_data/read_db.py +7 -9
- mloda_plugins/feature_group/input_data/read_db_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_dbs/sqlite.py +23 -13
- mloda_plugins/feature_group/input_data/read_file.py +8 -8
- mloda_plugins/feature_group/input_data/read_file_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_files/csv.py +6 -6
- mloda_plugins/feature_group/input_data/read_files/feather.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/json.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/orc.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/parquet.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/text_file_reader.py +5 -5
- mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +4 -4
- mloda-0.3.3.dist-info/RECORD +0 -230
- mloda_core/abstract_plugins/components/link.py +0 -286
- mloda_core/abstract_plugins/function_extender.py +0 -34
- mloda_core/runtime/run.py +0 -617
- {mloda_core → mloda/core}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/domain.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_name.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/base_transformer.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/hashable_dict.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/index.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/merge/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/utils.py +0 -0
- {mloda_core/abstract_plugins/plugin_loader → mloda/core/abstract_plugins/components/validators}/__init__.py +0 -0
- {mloda_core/api → mloda/core/abstract_plugins/plugin_loader}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/plugin_loader/plugin_loader.py +0 -0
- {mloda_core/api/prepare → mloda/core/api}/__init__.py +0 -0
- {mloda_core/core → mloda/core/api/prepare}/__init__.py +0 -0
- {mloda_core/core/step → mloda/core/core}/__init__.py +0 -0
- {mloda_core/filter → mloda/core/core/step}/__init__.py +0 -0
- {mloda_core/prepare → mloda/core/filter}/__init__.py +0 -0
- {mloda_core → mloda/core}/filter/filter_parameter.py +0 -0
- {mloda_core/prepare/graph → mloda/core/prepare}/__init__.py +0 -0
- {mloda_core/runtime → mloda/core/prepare/graph}/__init__.py +0 -0
- {mloda_core/runtime/flight → mloda/core/prepare/validators}/__init__.py +0 -0
- {mloda_core/runtime/worker → mloda/core/runtime}/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/flight_server.py +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/WHEEL +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/entry_points.txt +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/licenses/LICENSE.TXT +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.1.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -5,20 +5,21 @@ Base implementation for scikit-learn scaling feature groups.
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
7
|
import datetime
|
|
8
|
-
from typing import Any, Dict, Optional,
|
|
9
|
-
|
|
10
|
-
from
|
|
11
|
-
from
|
|
12
|
-
from
|
|
13
|
-
from
|
|
14
|
-
from
|
|
15
|
-
|
|
16
|
-
|
|
8
|
+
from typing import Any, Dict, Optional, Type
|
|
9
|
+
|
|
10
|
+
from mloda import FeatureGroup
|
|
11
|
+
from mloda import Feature
|
|
12
|
+
from mloda.provider import FeatureSet
|
|
13
|
+
from mloda.provider import FeatureChainParser
|
|
14
|
+
from mloda.provider import (
|
|
15
|
+
FeatureChainParserMixin,
|
|
16
|
+
)
|
|
17
|
+
from mloda.provider import BaseArtifact
|
|
17
18
|
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys
|
|
18
19
|
from mloda_plugins.feature_group.experimental.sklearn.sklearn_artifact import SklearnArtifact
|
|
19
20
|
|
|
20
21
|
|
|
21
|
-
class ScalingFeatureGroup(
|
|
22
|
+
class ScalingFeatureGroup(FeatureChainParserMixin, FeatureGroup):
|
|
22
23
|
"""
|
|
23
24
|
Base class for scikit-learn scaling feature groups.
|
|
24
25
|
|
|
@@ -82,17 +83,21 @@ class ScalingFeatureGroup(AbstractFeatureGroup):
|
|
|
82
83
|
PATTERN = "__"
|
|
83
84
|
PREFIX_PATTERN = r".*__(standard|minmax|robust|normalizer)_scaled$"
|
|
84
85
|
|
|
86
|
+
# In-feature configuration for FeatureChainParserMixin
|
|
87
|
+
MIN_IN_FEATURES = 1
|
|
88
|
+
MAX_IN_FEATURES = 1
|
|
89
|
+
|
|
85
90
|
# Property mapping for new configuration-based approach
|
|
86
91
|
PROPERTY_MAPPING = {
|
|
87
92
|
SCALER_TYPE: {
|
|
88
93
|
**SUPPORTED_SCALERS, # All supported scaler types as valid options
|
|
89
|
-
DefaultOptionKeys.
|
|
90
|
-
DefaultOptionKeys.
|
|
94
|
+
DefaultOptionKeys.context: True, # Context parameter
|
|
95
|
+
DefaultOptionKeys.strict_validation: True, # Enable strict validation
|
|
91
96
|
},
|
|
92
97
|
DefaultOptionKeys.in_features: {
|
|
93
98
|
"explanation": "Source feature to scale",
|
|
94
|
-
DefaultOptionKeys.
|
|
95
|
-
DefaultOptionKeys.
|
|
99
|
+
DefaultOptionKeys.context: True, # Context parameter
|
|
100
|
+
DefaultOptionKeys.strict_validation: False, # Flexible validation
|
|
96
101
|
},
|
|
97
102
|
}
|
|
98
103
|
|
|
@@ -101,22 +106,6 @@ class ScalingFeatureGroup(AbstractFeatureGroup):
|
|
|
101
106
|
"""Return the artifact class for sklearn scaler persistence."""
|
|
102
107
|
return SklearnArtifact
|
|
103
108
|
|
|
104
|
-
def input_features(self, options: Options, feature_name: FeatureName) -> Optional[Set[Feature]]:
|
|
105
|
-
"""Extract source feature from either configuration-based options or string parsing."""
|
|
106
|
-
|
|
107
|
-
# Try string-based parsing first
|
|
108
|
-
_, source_feature = FeatureChainParser.parse_feature_name(feature_name, [self.PREFIX_PATTERN])
|
|
109
|
-
if source_feature is not None:
|
|
110
|
-
return {Feature(source_feature)}
|
|
111
|
-
|
|
112
|
-
# Fall back to configuration-based approach
|
|
113
|
-
source_features = options.get_in_features()
|
|
114
|
-
if len(source_features) != 1:
|
|
115
|
-
raise ValueError(
|
|
116
|
-
f"Expected exactly one source feature, but found {len(source_features)}: {source_features}"
|
|
117
|
-
)
|
|
118
|
-
return set(source_features)
|
|
119
|
-
|
|
120
109
|
@classmethod
|
|
121
110
|
def get_scaler_type(cls, feature_name: str) -> str:
|
|
122
111
|
"""Extract the scaler type from the feature name."""
|
|
@@ -133,22 +122,6 @@ class ScalingFeatureGroup(AbstractFeatureGroup):
|
|
|
133
122
|
|
|
134
123
|
return scaler_type
|
|
135
124
|
|
|
136
|
-
@classmethod
|
|
137
|
-
def match_feature_group_criteria(
|
|
138
|
-
cls,
|
|
139
|
-
feature_name: Union[FeatureName, str],
|
|
140
|
-
options: Options,
|
|
141
|
-
data_access_collection: Optional[Any] = None,
|
|
142
|
-
) -> bool:
|
|
143
|
-
"""Check if feature name matches the expected pattern using unified parser."""
|
|
144
|
-
# Use the unified parser with property mapping for full configuration support
|
|
145
|
-
return FeatureChainParser.match_configuration_feature_chain_parser(
|
|
146
|
-
feature_name,
|
|
147
|
-
options,
|
|
148
|
-
property_mapping=cls.PROPERTY_MAPPING,
|
|
149
|
-
prefix_patterns=[cls.PREFIX_PATTERN],
|
|
150
|
-
)
|
|
151
|
-
|
|
152
125
|
@classmethod
|
|
153
126
|
def calculate_feature(cls, data: Any, features: FeatureSet) -> Any:
|
|
154
127
|
"""
|
|
@@ -213,33 +186,44 @@ class ScalingFeatureGroup(AbstractFeatureGroup):
|
|
|
213
186
|
Raises:
|
|
214
187
|
ValueError: If parameters cannot be extracted
|
|
215
188
|
"""
|
|
216
|
-
|
|
217
|
-
|
|
189
|
+
source_features = cls._extract_source_features(feature)
|
|
190
|
+
scaler_type = cls._extract_scaler_type(feature)
|
|
191
|
+
if scaler_type is None:
|
|
192
|
+
raise ValueError(f"Could not extract scaler type from: {feature.name}")
|
|
193
|
+
return scaler_type, source_features[0]
|
|
218
194
|
|
|
219
|
-
|
|
195
|
+
@classmethod
|
|
196
|
+
def _extract_scaler_type(cls, feature: Feature) -> Optional[str]:
|
|
197
|
+
"""
|
|
198
|
+
Extract scaler type from a feature.
|
|
199
|
+
|
|
200
|
+
Tries string-based parsing first, falls back to configuration-based approach.
|
|
201
|
+
|
|
202
|
+
Args:
|
|
203
|
+
feature: The feature to extract scaler type from
|
|
204
|
+
|
|
205
|
+
Returns:
|
|
206
|
+
The scaler type string
|
|
207
|
+
|
|
208
|
+
Raises:
|
|
209
|
+
ValueError: If scaler type is unsupported
|
|
210
|
+
"""
|
|
220
211
|
feature_name_str = feature.name.name if hasattr(feature.name, "name") else str(feature.name)
|
|
221
212
|
|
|
213
|
+
# Try string-based parsing first
|
|
222
214
|
if FeatureChainParser.is_chained_feature(feature_name_str):
|
|
223
215
|
scaler_type = cls.get_scaler_type(feature_name_str)
|
|
224
|
-
|
|
225
|
-
return scaler_type, source_feature_name
|
|
216
|
+
return scaler_type
|
|
226
217
|
|
|
227
218
|
# Fall back to configuration-based approach
|
|
228
|
-
source_features = feature.options.get_in_features()
|
|
229
|
-
source_feature = next(iter(source_features))
|
|
230
|
-
source_feature_name = source_feature.get_name()
|
|
231
|
-
|
|
232
219
|
scaler_type = feature.options.get(cls.SCALER_TYPE)
|
|
233
220
|
|
|
234
|
-
if scaler_type is None
|
|
235
|
-
raise ValueError(f"Could not extract scaler type and source feature from: {feature.name}")
|
|
236
|
-
|
|
237
|
-
if scaler_type not in cls.SUPPORTED_SCALERS:
|
|
221
|
+
if scaler_type is not None and scaler_type not in cls.SUPPORTED_SCALERS:
|
|
238
222
|
raise ValueError(
|
|
239
223
|
f"Unsupported scaler type: {scaler_type}. Supported types: {', '.join(cls.SUPPORTED_SCALERS.keys())}"
|
|
240
224
|
)
|
|
241
225
|
|
|
242
|
-
return scaler_type
|
|
226
|
+
return str(scaler_type) if scaler_type is not None else None
|
|
243
227
|
|
|
244
228
|
@classmethod
|
|
245
229
|
def _import_sklearn_components(cls) -> Dict[str, Any]:
|
|
@@ -6,7 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
from typing import Any, Set, Type, Union
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from mloda import ComputeFramework
|
|
10
10
|
|
|
11
11
|
from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataFrame
|
|
12
12
|
from mloda_plugins.feature_group.experimental.sklearn.scaling.base import ScalingFeatureGroup
|
|
@@ -21,7 +21,7 @@ class PandasScalingFeatureGroup(ScalingFeatureGroup):
|
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
23
|
@classmethod
|
|
24
|
-
def compute_framework_rule(cls) -> Union[bool, Set[Type[
|
|
24
|
+
def compute_framework_rule(cls) -> Union[bool, Set[Type[ComputeFramework]]]:
|
|
25
25
|
"""Specify that this feature group works with Pandas."""
|
|
26
26
|
return {PandasDataFrame}
|
|
27
27
|
|
|
@@ -4,14 +4,13 @@ Artifact for storing fitted scikit-learn transformers and estimators.
|
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
6
|
import base64
|
|
7
|
-
import os
|
|
8
7
|
import hashlib
|
|
9
8
|
import tempfile
|
|
10
9
|
from pathlib import Path
|
|
11
10
|
from typing import Any, Dict, Optional
|
|
12
11
|
|
|
13
|
-
from
|
|
14
|
-
from
|
|
12
|
+
from mloda.provider import BaseArtifact
|
|
13
|
+
from mloda.provider import FeatureSet
|
|
15
14
|
|
|
16
15
|
|
|
17
16
|
class SklearnArtifact(BaseArtifact):
|
|
@@ -39,16 +39,16 @@ Further, it allows defining:
|
|
|
39
39
|
"""
|
|
40
40
|
|
|
41
41
|
from typing import Any, Dict, NamedTuple, Optional, Set, Tuple, Type, Union
|
|
42
|
-
from
|
|
43
|
-
from
|
|
44
|
-
from
|
|
45
|
-
from
|
|
46
|
-
from
|
|
47
|
-
from
|
|
42
|
+
from mloda import FeatureGroup
|
|
43
|
+
from mloda import Feature
|
|
44
|
+
from mloda.user import FeatureName
|
|
45
|
+
from mloda.user import Index
|
|
46
|
+
from mloda.user import JoinType, Link, JoinSpec
|
|
47
|
+
from mloda import Options
|
|
48
48
|
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
class SourceInputFeature(
|
|
51
|
+
class SourceInputFeature(FeatureGroup):
|
|
52
52
|
"""
|
|
53
53
|
This feature group focuses on defining input features, especially when they originate
|
|
54
54
|
from other sources or require joins/merges.
|
|
@@ -100,19 +100,19 @@ class SourceTuple(NamedTuple):
|
|
|
100
100
|
|
|
101
101
|
Attributes:
|
|
102
102
|
feature_name: The name of the feature.
|
|
103
|
-
source_class: (Optional) The source class of the feature, can be an `
|
|
103
|
+
source_class: (Optional) The source class of the feature, can be an `FeatureGroup` class or a `str` representing a scope.
|
|
104
104
|
source_value: (Optional) The value associated with the source class, if applicable.
|
|
105
|
-
left_link: (Optional) A tuple containing the left-side `
|
|
106
|
-
right_link: (Optional) A tuple containing the right-side `
|
|
105
|
+
left_link: (Optional) A tuple containing the left-side `FeatureGroup` class and index for join operations.
|
|
106
|
+
right_link: (Optional) A tuple containing the right-side `FeatureGroup` class and index for join operations.
|
|
107
107
|
join_type: (Optional) The type of join operation (`JoinType`).
|
|
108
108
|
merge_index: (Optional) The index to use for merge operations.
|
|
109
109
|
"""
|
|
110
110
|
|
|
111
111
|
feature_name: str
|
|
112
|
-
source_class: Optional[Type[Union[
|
|
112
|
+
source_class: Optional[Type[Union[FeatureGroup, str]]] = None
|
|
113
113
|
source_value: Optional[str] = None
|
|
114
|
-
left_link: Optional[Tuple[Type[
|
|
115
|
-
right_link: Optional[Tuple[Type[
|
|
114
|
+
left_link: Optional[Tuple[Type[FeatureGroup], Union[str, Index]]] = None
|
|
115
|
+
right_link: Optional[Tuple[Type[FeatureGroup], Union[str, Index]]] = None
|
|
116
116
|
join_type: Optional[JoinType] = None
|
|
117
117
|
merge_index: Optional[Union[str, Index]] = None
|
|
118
118
|
|
|
@@ -207,8 +207,8 @@ class SourceInputFeatureComposite:
|
|
|
207
207
|
@classmethod
|
|
208
208
|
def _handle_link(
|
|
209
209
|
cls,
|
|
210
|
-
left_link: Tuple[Type[
|
|
211
|
-
right_link: Tuple[Type[
|
|
210
|
+
left_link: Tuple[Type[FeatureGroup], Union[str, Index]],
|
|
211
|
+
right_link: Tuple[Type[FeatureGroup], Union[str, Index]],
|
|
212
212
|
join_type: Any,
|
|
213
213
|
) -> Link:
|
|
214
214
|
"""
|
|
@@ -4,18 +4,19 @@ Base implementation for text cleaning feature groups.
|
|
|
4
4
|
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
|
-
from typing import Any, Optional
|
|
8
|
-
|
|
9
|
-
from
|
|
10
|
-
from
|
|
11
|
-
from
|
|
12
|
-
from
|
|
13
|
-
|
|
14
|
-
|
|
7
|
+
from typing import Any, Optional
|
|
8
|
+
|
|
9
|
+
from mloda import FeatureGroup
|
|
10
|
+
from mloda import Feature
|
|
11
|
+
from mloda.provider import FeatureChainParser
|
|
12
|
+
from mloda.provider import (
|
|
13
|
+
FeatureChainParserMixin,
|
|
14
|
+
)
|
|
15
|
+
from mloda.provider import FeatureSet
|
|
15
16
|
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys
|
|
16
17
|
|
|
17
18
|
|
|
18
|
-
class TextCleaningFeatureGroup(
|
|
19
|
+
class TextCleaningFeatureGroup(FeatureChainParserMixin, FeatureGroup):
|
|
19
20
|
# Option key for the list of operations
|
|
20
21
|
CLEANING_OPERATIONS = "cleaning_operations"
|
|
21
22
|
|
|
@@ -33,13 +34,17 @@ class TextCleaningFeatureGroup(AbstractFeatureGroup):
|
|
|
33
34
|
PATTERN = "__"
|
|
34
35
|
PREFIX_PATTERN = r".*__cleaned_text$"
|
|
35
36
|
|
|
37
|
+
# In-feature configuration for FeatureChainParserMixin
|
|
38
|
+
MIN_IN_FEATURES = 1
|
|
39
|
+
MAX_IN_FEATURES = 1
|
|
40
|
+
|
|
36
41
|
# Property mapping for configuration-based features
|
|
37
42
|
PROPERTY_MAPPING = {
|
|
38
43
|
CLEANING_OPERATIONS: {
|
|
39
44
|
**SUPPORTED_OPERATIONS, # All supported operations as valid options
|
|
40
|
-
DefaultOptionKeys.
|
|
41
|
-
DefaultOptionKeys.
|
|
42
|
-
DefaultOptionKeys.
|
|
45
|
+
DefaultOptionKeys.context: True, # Mark as context parameter
|
|
46
|
+
DefaultOptionKeys.strict_validation: True, # Enable strict validation
|
|
47
|
+
DefaultOptionKeys.validation_function: lambda operations: (
|
|
43
48
|
# Handle both actual tuples/lists and string representations
|
|
44
49
|
(
|
|
45
50
|
isinstance(operations, (tuple, list))
|
|
@@ -59,7 +64,7 @@ class TextCleaningFeatureGroup(AbstractFeatureGroup):
|
|
|
59
64
|
},
|
|
60
65
|
DefaultOptionKeys.in_features: {
|
|
61
66
|
"explanation": "Source feature to apply text cleaning operations to",
|
|
62
|
-
DefaultOptionKeys.
|
|
67
|
+
DefaultOptionKeys.context: True,
|
|
63
68
|
},
|
|
64
69
|
}
|
|
65
70
|
|
|
@@ -115,41 +120,6 @@ class TextCleaningFeatureGroup(AbstractFeatureGroup):
|
|
|
115
120
|
- The source feature must contain text data
|
|
116
121
|
"""
|
|
117
122
|
|
|
118
|
-
def input_features(self, options: Options, feature_name: FeatureName) -> Optional[Set[Feature]]:
|
|
119
|
-
"""Extract source feature from either configuration-based options or string parsing."""
|
|
120
|
-
|
|
121
|
-
source_feature: str | None = None
|
|
122
|
-
|
|
123
|
-
# Try string-based parsing first
|
|
124
|
-
_, source_feature = FeatureChainParser.parse_feature_name(feature_name, [self.PREFIX_PATTERN])
|
|
125
|
-
if source_feature is not None:
|
|
126
|
-
return {Feature(source_feature)}
|
|
127
|
-
|
|
128
|
-
# Fall back to configuration-based approach
|
|
129
|
-
source_features = options.get_in_features()
|
|
130
|
-
if len(source_features) != 1:
|
|
131
|
-
raise ValueError(
|
|
132
|
-
f"Expected exactly one source feature, but found {len(source_features)}: {source_features}"
|
|
133
|
-
)
|
|
134
|
-
return set(source_features)
|
|
135
|
-
|
|
136
|
-
@classmethod
|
|
137
|
-
def match_feature_group_criteria(
|
|
138
|
-
cls,
|
|
139
|
-
feature_name: Union[FeatureName, str],
|
|
140
|
-
options: Options,
|
|
141
|
-
data_access_collection: Optional[Any] = None,
|
|
142
|
-
) -> bool:
|
|
143
|
-
"""Check if feature name matches the expected pattern for text cleaning features."""
|
|
144
|
-
|
|
145
|
-
# Use the unified parser with property mapping for full configuration support
|
|
146
|
-
return FeatureChainParser.match_configuration_feature_chain_parser(
|
|
147
|
-
feature_name,
|
|
148
|
-
options,
|
|
149
|
-
property_mapping=cls.PROPERTY_MAPPING,
|
|
150
|
-
prefix_patterns=[cls.PREFIX_PATTERN],
|
|
151
|
-
)
|
|
152
|
-
|
|
153
123
|
@classmethod
|
|
154
124
|
def _extract_operations_and_source_feature(cls, feature: Feature) -> tuple[tuple[Any, Any], str]:
|
|
155
125
|
"""
|
|
@@ -166,31 +136,36 @@ class TextCleaningFeatureGroup(AbstractFeatureGroup):
|
|
|
166
136
|
Raises:
|
|
167
137
|
ValueError: If parameters cannot be extracted
|
|
168
138
|
"""
|
|
169
|
-
|
|
170
|
-
|
|
139
|
+
source_features = cls._extract_source_features(feature)
|
|
140
|
+
operations = cls._extract_cleaning_operations(feature)
|
|
141
|
+
if operations is None:
|
|
142
|
+
raise ValueError(f"Could not extract operations from: {feature.name}")
|
|
143
|
+
return operations, source_features[0]
|
|
171
144
|
|
|
145
|
+
@classmethod
|
|
146
|
+
def _extract_cleaning_operations(cls, feature: Feature) -> Optional[tuple[Any, Any]]:
|
|
147
|
+
"""
|
|
148
|
+
Extract cleaning operations from a feature.
|
|
149
|
+
|
|
150
|
+
Tries string-based parsing first, falls back to configuration-based approach.
|
|
151
|
+
|
|
152
|
+
Args:
|
|
153
|
+
feature: The feature to extract operations from
|
|
154
|
+
|
|
155
|
+
Returns:
|
|
156
|
+
Tuple of cleaning operations, or None if not found
|
|
157
|
+
"""
|
|
172
158
|
# Try string-based parsing first
|
|
173
159
|
feature_name_str = feature.name.name if hasattr(feature.name, "name") else str(feature.name)
|
|
174
160
|
|
|
175
161
|
if FeatureChainParser.is_chained_feature(feature_name_str):
|
|
176
|
-
_, source_feature_name = FeatureChainParser.parse_feature_name(feature_name_str, [cls.PREFIX_PATTERN])
|
|
177
162
|
# For string-based features, get operations from options
|
|
178
163
|
operations = feature.options.get(cls.CLEANING_OPERATIONS) or ()
|
|
179
|
-
|
|
180
|
-
raise ValueError(f"Could not extract source feature from string-based feature: {feature.name}")
|
|
181
|
-
return operations, source_feature_name # type: ignore
|
|
164
|
+
return operations # type: ignore
|
|
182
165
|
|
|
183
166
|
# Fall back to configuration-based approach
|
|
184
|
-
source_features = feature.options.get_in_features()
|
|
185
|
-
source_feature = next(iter(source_features))
|
|
186
|
-
source_feature_name = source_feature.get_name()
|
|
187
|
-
|
|
188
167
|
operations = feature.options.get(cls.CLEANING_OPERATIONS)
|
|
189
|
-
|
|
190
|
-
if operations is None or source_feature_name is None:
|
|
191
|
-
raise ValueError(f"Could not extract cleaning operations and source feature from: {feature.name}")
|
|
192
|
-
|
|
193
|
-
return operations, source_feature_name
|
|
168
|
+
return operations if operations is not None else None
|
|
194
169
|
|
|
195
170
|
@classmethod
|
|
196
171
|
def calculate_feature(cls, data: Any, features: FeatureSet) -> Any:
|
|
@@ -25,7 +25,7 @@ except ImportError:
|
|
|
25
25
|
pd = None
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
from
|
|
28
|
+
from mloda import ComputeFramework
|
|
29
29
|
from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataFrame
|
|
30
30
|
from mloda_plugins.feature_group.experimental.text_cleaning.base import TextCleaningFeatureGroup
|
|
31
31
|
|
|
@@ -40,7 +40,7 @@ class PandasTextCleaningFeatureGroup(TextCleaningFeatureGroup):
|
|
|
40
40
|
"""
|
|
41
41
|
|
|
42
42
|
@classmethod
|
|
43
|
-
def compute_framework_rule(cls) -> set[type[
|
|
43
|
+
def compute_framework_rule(cls) -> set[type[ComputeFramework]]:
|
|
44
44
|
"""Define the compute framework for this feature group."""
|
|
45
45
|
return {PandasDataFrame}
|
|
46
46
|
|
|
@@ -9,7 +9,7 @@ import string
|
|
|
9
9
|
import unicodedata
|
|
10
10
|
from typing import Any, Dict, List, Set, Type, Union
|
|
11
11
|
|
|
12
|
-
from
|
|
12
|
+
from mloda import ComputeFramework
|
|
13
13
|
|
|
14
14
|
from mloda_plugins.compute_framework.base_implementations.python_dict.python_dict_framework import PythonDictFramework
|
|
15
15
|
from mloda_plugins.feature_group.experimental.text_cleaning.base import TextCleaningFeatureGroup
|
|
@@ -35,7 +35,7 @@ class PythonDictTextCleaningFeatureGroup(TextCleaningFeatureGroup):
|
|
|
35
35
|
"""
|
|
36
36
|
|
|
37
37
|
@classmethod
|
|
38
|
-
def compute_framework_rule(cls) -> Union[bool, Set[Type[
|
|
38
|
+
def compute_framework_rule(cls) -> Union[bool, Set[Type[ComputeFramework]]]:
|
|
39
39
|
return {PythonDictFramework}
|
|
40
40
|
|
|
41
41
|
@classmethod
|