mloda 0.3.3__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mloda/__init__.py +17 -0
- {mloda_core → mloda/core}/abstract_plugins/components/base_artifact.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/base_validator.py +13 -0
- {mloda_core → mloda/core}/abstract_plugins/components/data_access_collection.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/data_types.py +39 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature.py +39 -33
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/feature_chain_parser.py +19 -19
- mloda/core/abstract_plugins/components/feature_chainer/feature_chain_parser_mixin.py +197 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_collection.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/feature_group_version.py +8 -8
- {mloda_core → mloda/core}/abstract_plugins/components/feature_set.py +18 -24
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/cfw_transformer.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/index/add_index_feature.py +4 -4
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data_collection.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/base_api_data.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/base_input_data.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/data_creator.py +3 -3
- mloda/core/abstract_plugins/components/link.py +437 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/match_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/merge/base_merge_engine.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/options.py +12 -36
- {mloda_core → mloda/core}/abstract_plugins/components/parallelization_modes.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/plugin_collector.py +14 -14
- mloda/core/abstract_plugins/components/validators/datatype_validator.py +96 -0
- mloda/core/abstract_plugins/components/validators/feature_set_validator.py +38 -0
- mloda/core/abstract_plugins/components/validators/feature_validator.py +23 -0
- mloda/core/abstract_plugins/components/validators/link_validator.py +79 -0
- mloda/core/abstract_plugins/components/validators/options_validator.py +57 -0
- mloda_core/abstract_plugins/compute_frame_work.py → mloda/core/abstract_plugins/compute_framework.py +46 -37
- mloda_core/abstract_plugins/abstract_feature_group.py → mloda/core/abstract_plugins/feature_group.py +56 -33
- mloda/core/abstract_plugins/function_extender.py +78 -0
- mloda/core/api/plugin_docs.py +220 -0
- mloda/core/api/plugin_info.py +32 -0
- {mloda_core → mloda/core}/api/prepare/setup_compute_framework.py +11 -11
- {mloda_core → mloda/core}/api/request.py +42 -33
- {mloda_core → mloda/core}/core/cfw_manager.py +8 -8
- {mloda_core → mloda/core}/core/engine.py +47 -46
- {mloda_core → mloda/core}/core/step/abstract_step.py +7 -7
- {mloda_core → mloda/core}/core/step/feature_group_step.py +12 -12
- {mloda_core → mloda/core}/core/step/join_step.py +14 -14
- {mloda_core → mloda/core}/core/step/transform_frame_work_step.py +16 -16
- {mloda_core → mloda/core}/filter/filter_engine.py +1 -1
- {mloda_core → mloda/core}/filter/filter_type_enum.py +1 -1
- {mloda_core → mloda/core}/filter/global_filter.py +23 -23
- {mloda_core → mloda/core}/filter/single_filter.py +6 -6
- {mloda_core → mloda/core}/prepare/accessible_plugins.py +16 -18
- {mloda_core → mloda/core}/prepare/execution_plan.py +65 -39
- {mloda_core → mloda/core}/prepare/graph/build_graph.py +6 -6
- {mloda_core → mloda/core}/prepare/graph/graph.py +1 -1
- {mloda_core → mloda/core}/prepare/graph/properties.py +5 -5
- {mloda_core → mloda/core}/prepare/identify_feature_group.py +12 -14
- {mloda_core → mloda/core}/prepare/joinstep_collection.py +3 -3
- {mloda_core → mloda/core}/prepare/resolve_compute_frameworks.py +6 -6
- {mloda_core → mloda/core}/prepare/resolve_graph.py +11 -11
- {mloda_core → mloda/core}/prepare/resolve_links.py +11 -31
- mloda/core/prepare/validators/resolve_link_validator.py +32 -0
- mloda/core/runtime/compute_framework_executor.py +271 -0
- mloda/core/runtime/data_lifecycle_manager.py +160 -0
- mloda/core/runtime/flight/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/runner_flight_server.py +1 -1
- mloda/core/runtime/run.py +317 -0
- mloda/core/runtime/worker/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/worker/multiprocessing_worker.py +15 -10
- {mloda_core → mloda/core}/runtime/worker/thread_worker.py +2 -2
- mloda/core/runtime/worker_manager.py +96 -0
- mloda/provider/__init__.py +101 -0
- mloda/steward/__init__.py +25 -0
- mloda/user/__init__.py +57 -0
- {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/METADATA +18 -22
- mloda-0.4.0.dist-info/RECORD +248 -0
- {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/top_level.txt +1 -1
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_framework.py +15 -13
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +12 -10
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +18 -16
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_filter_engine.py +36 -13
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_merge_engine.py +7 -7
- mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +16 -14
- mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +13 -12
- mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +12 -10
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +11 -9
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +17 -15
- mloda_plugins/compute_framework/base_implementations/spark/spark_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
- mloda_plugins/config/feature/loader.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +45 -62
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/clustering/base.py +69 -97
- mloda_plugins/feature_group/experimental/clustering/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +58 -79
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/default_options_key.py +16 -19
- mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +80 -94
- mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +24 -24
- mloda_plugins/feature_group/experimental/forecasting/base.py +106 -104
- mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py +2 -2
- mloda_plugins/feature_group/experimental/forecasting/pandas.py +15 -15
- mloda_plugins/feature_group/experimental/geo_distance/base.py +50 -42
- mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/llm/cli.py +4 -4
- mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +19 -19
- mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +8 -8
- mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/claude.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/gemini.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/openai.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +6 -6
- mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +10 -10
- mloda_plugins/feature_group/experimental/llm/tools/tool_collection.py +1 -1
- mloda_plugins/feature_group/experimental/node_centrality/base.py +46 -72
- mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +51 -51
- mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +52 -39
- mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +44 -58
- mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py +2 -2
- mloda_plugins/feature_group/experimental/source_input_feature.py +15 -15
- mloda_plugins/feature_group/experimental/text_cleaning/base.py +38 -61
- mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/text_cleaning/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/time_window/base.py +106 -93
- mloda_plugins/feature_group/experimental/time_window/pandas.py +13 -13
- mloda_plugins/feature_group/experimental/time_window/pyarrow.py +12 -12
- mloda_plugins/feature_group/input_data/api_data/api_data.py +9 -11
- mloda_plugins/feature_group/input_data/read_context_files.py +7 -7
- mloda_plugins/feature_group/input_data/read_db.py +7 -9
- mloda_plugins/feature_group/input_data/read_db_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_dbs/sqlite.py +23 -13
- mloda_plugins/feature_group/input_data/read_file.py +8 -8
- mloda_plugins/feature_group/input_data/read_file_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_files/csv.py +6 -6
- mloda_plugins/feature_group/input_data/read_files/feather.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/json.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/orc.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/parquet.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/text_file_reader.py +5 -5
- mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +4 -4
- mloda-0.3.3.dist-info/RECORD +0 -230
- mloda_core/abstract_plugins/components/link.py +0 -286
- mloda_core/abstract_plugins/function_extender.py +0 -34
- mloda_core/runtime/run.py +0 -617
- {mloda_core → mloda/core}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/domain.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_name.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/base_transformer.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/hashable_dict.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/index.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/merge/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/utils.py +0 -0
- {mloda_core/abstract_plugins/plugin_loader → mloda/core/abstract_plugins/components/validators}/__init__.py +0 -0
- {mloda_core/api → mloda/core/abstract_plugins/plugin_loader}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/plugin_loader/plugin_loader.py +0 -0
- {mloda_core/api/prepare → mloda/core/api}/__init__.py +0 -0
- {mloda_core/core → mloda/core/api/prepare}/__init__.py +0 -0
- {mloda_core/core/step → mloda/core/core}/__init__.py +0 -0
- {mloda_core/filter → mloda/core/core/step}/__init__.py +0 -0
- {mloda_core/prepare → mloda/core/filter}/__init__.py +0 -0
- {mloda_core → mloda/core}/filter/filter_parameter.py +0 -0
- {mloda_core/prepare/graph → mloda/core/prepare}/__init__.py +0 -0
- {mloda_core/runtime → mloda/core/prepare/graph}/__init__.py +0 -0
- {mloda_core/runtime/flight → mloda/core/prepare/validators}/__init__.py +0 -0
- {mloda_core/runtime/worker → mloda/core/runtime}/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/flight_server.py +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/WHEEL +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/entry_points.txt +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/licenses/LICENSE.TXT +0 -0
- {mloda-0.3.3.dist-info → mloda-0.4.0.dist-info}/licenses/NOTICE.md +0 -0
|
@@ -7,18 +7,21 @@ from __future__ import annotations
|
|
|
7
7
|
import datetime
|
|
8
8
|
from typing import Any, Dict, Optional, Set, Type, Union
|
|
9
9
|
|
|
10
|
-
from
|
|
11
|
-
from
|
|
12
|
-
from
|
|
13
|
-
from
|
|
14
|
-
from
|
|
15
|
-
from
|
|
16
|
-
from
|
|
10
|
+
from mloda import FeatureGroup
|
|
11
|
+
from mloda import Feature
|
|
12
|
+
from mloda.user import FeatureName
|
|
13
|
+
from mloda.provider import FeatureSet
|
|
14
|
+
from mloda import Options
|
|
15
|
+
from mloda.provider import FeatureChainParser
|
|
16
|
+
from mloda.provider import (
|
|
17
|
+
FeatureChainParserMixin,
|
|
18
|
+
)
|
|
19
|
+
from mloda.provider import BaseArtifact
|
|
17
20
|
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys
|
|
18
21
|
from mloda_plugins.feature_group.experimental.sklearn.sklearn_artifact import SklearnArtifact
|
|
19
22
|
|
|
20
23
|
|
|
21
|
-
class ScalingFeatureGroup(
|
|
24
|
+
class ScalingFeatureGroup(FeatureChainParserMixin, FeatureGroup):
|
|
22
25
|
"""
|
|
23
26
|
Base class for scikit-learn scaling feature groups.
|
|
24
27
|
|
|
@@ -82,17 +85,21 @@ class ScalingFeatureGroup(AbstractFeatureGroup):
|
|
|
82
85
|
PATTERN = "__"
|
|
83
86
|
PREFIX_PATTERN = r".*__(standard|minmax|robust|normalizer)_scaled$"
|
|
84
87
|
|
|
88
|
+
# In-feature configuration for FeatureChainParserMixin
|
|
89
|
+
MIN_IN_FEATURES = 1
|
|
90
|
+
MAX_IN_FEATURES = 1
|
|
91
|
+
|
|
85
92
|
# Property mapping for new configuration-based approach
|
|
86
93
|
PROPERTY_MAPPING = {
|
|
87
94
|
SCALER_TYPE: {
|
|
88
95
|
**SUPPORTED_SCALERS, # All supported scaler types as valid options
|
|
89
|
-
DefaultOptionKeys.
|
|
90
|
-
DefaultOptionKeys.
|
|
96
|
+
DefaultOptionKeys.context: True, # Context parameter
|
|
97
|
+
DefaultOptionKeys.strict_validation: True, # Enable strict validation
|
|
91
98
|
},
|
|
92
99
|
DefaultOptionKeys.in_features: {
|
|
93
100
|
"explanation": "Source feature to scale",
|
|
94
|
-
DefaultOptionKeys.
|
|
95
|
-
DefaultOptionKeys.
|
|
101
|
+
DefaultOptionKeys.context: True, # Context parameter
|
|
102
|
+
DefaultOptionKeys.strict_validation: False, # Flexible validation
|
|
96
103
|
},
|
|
97
104
|
}
|
|
98
105
|
|
|
@@ -101,22 +108,6 @@ class ScalingFeatureGroup(AbstractFeatureGroup):
|
|
|
101
108
|
"""Return the artifact class for sklearn scaler persistence."""
|
|
102
109
|
return SklearnArtifact
|
|
103
110
|
|
|
104
|
-
def input_features(self, options: Options, feature_name: FeatureName) -> Optional[Set[Feature]]:
|
|
105
|
-
"""Extract source feature from either configuration-based options or string parsing."""
|
|
106
|
-
|
|
107
|
-
# Try string-based parsing first
|
|
108
|
-
_, source_feature = FeatureChainParser.parse_feature_name(feature_name, [self.PREFIX_PATTERN])
|
|
109
|
-
if source_feature is not None:
|
|
110
|
-
return {Feature(source_feature)}
|
|
111
|
-
|
|
112
|
-
# Fall back to configuration-based approach
|
|
113
|
-
source_features = options.get_in_features()
|
|
114
|
-
if len(source_features) != 1:
|
|
115
|
-
raise ValueError(
|
|
116
|
-
f"Expected exactly one source feature, but found {len(source_features)}: {source_features}"
|
|
117
|
-
)
|
|
118
|
-
return set(source_features)
|
|
119
|
-
|
|
120
111
|
@classmethod
|
|
121
112
|
def get_scaler_type(cls, feature_name: str) -> str:
|
|
122
113
|
"""Extract the scaler type from the feature name."""
|
|
@@ -133,22 +124,6 @@ class ScalingFeatureGroup(AbstractFeatureGroup):
|
|
|
133
124
|
|
|
134
125
|
return scaler_type
|
|
135
126
|
|
|
136
|
-
@classmethod
|
|
137
|
-
def match_feature_group_criteria(
|
|
138
|
-
cls,
|
|
139
|
-
feature_name: Union[FeatureName, str],
|
|
140
|
-
options: Options,
|
|
141
|
-
data_access_collection: Optional[Any] = None,
|
|
142
|
-
) -> bool:
|
|
143
|
-
"""Check if feature name matches the expected pattern using unified parser."""
|
|
144
|
-
# Use the unified parser with property mapping for full configuration support
|
|
145
|
-
return FeatureChainParser.match_configuration_feature_chain_parser(
|
|
146
|
-
feature_name,
|
|
147
|
-
options,
|
|
148
|
-
property_mapping=cls.PROPERTY_MAPPING,
|
|
149
|
-
prefix_patterns=[cls.PREFIX_PATTERN],
|
|
150
|
-
)
|
|
151
|
-
|
|
152
127
|
@classmethod
|
|
153
128
|
def calculate_feature(cls, data: Any, features: FeatureSet) -> Any:
|
|
154
129
|
"""
|
|
@@ -213,33 +188,44 @@ class ScalingFeatureGroup(AbstractFeatureGroup):
|
|
|
213
188
|
Raises:
|
|
214
189
|
ValueError: If parameters cannot be extracted
|
|
215
190
|
"""
|
|
216
|
-
|
|
217
|
-
|
|
191
|
+
source_features = cls._extract_source_features(feature)
|
|
192
|
+
scaler_type = cls._extract_scaler_type(feature)
|
|
193
|
+
if scaler_type is None:
|
|
194
|
+
raise ValueError(f"Could not extract scaler type from: {feature.name}")
|
|
195
|
+
return scaler_type, source_features[0]
|
|
218
196
|
|
|
219
|
-
|
|
197
|
+
@classmethod
|
|
198
|
+
def _extract_scaler_type(cls, feature: Feature) -> Optional[str]:
|
|
199
|
+
"""
|
|
200
|
+
Extract scaler type from a feature.
|
|
201
|
+
|
|
202
|
+
Tries string-based parsing first, falls back to configuration-based approach.
|
|
203
|
+
|
|
204
|
+
Args:
|
|
205
|
+
feature: The feature to extract scaler type from
|
|
206
|
+
|
|
207
|
+
Returns:
|
|
208
|
+
The scaler type string
|
|
209
|
+
|
|
210
|
+
Raises:
|
|
211
|
+
ValueError: If scaler type is unsupported
|
|
212
|
+
"""
|
|
220
213
|
feature_name_str = feature.name.name if hasattr(feature.name, "name") else str(feature.name)
|
|
221
214
|
|
|
215
|
+
# Try string-based parsing first
|
|
222
216
|
if FeatureChainParser.is_chained_feature(feature_name_str):
|
|
223
217
|
scaler_type = cls.get_scaler_type(feature_name_str)
|
|
224
|
-
|
|
225
|
-
return scaler_type, source_feature_name
|
|
218
|
+
return scaler_type
|
|
226
219
|
|
|
227
220
|
# Fall back to configuration-based approach
|
|
228
|
-
source_features = feature.options.get_in_features()
|
|
229
|
-
source_feature = next(iter(source_features))
|
|
230
|
-
source_feature_name = source_feature.get_name()
|
|
231
|
-
|
|
232
221
|
scaler_type = feature.options.get(cls.SCALER_TYPE)
|
|
233
222
|
|
|
234
|
-
if scaler_type is None
|
|
235
|
-
raise ValueError(f"Could not extract scaler type and source feature from: {feature.name}")
|
|
236
|
-
|
|
237
|
-
if scaler_type not in cls.SUPPORTED_SCALERS:
|
|
223
|
+
if scaler_type is not None and scaler_type not in cls.SUPPORTED_SCALERS:
|
|
238
224
|
raise ValueError(
|
|
239
225
|
f"Unsupported scaler type: {scaler_type}. Supported types: {', '.join(cls.SUPPORTED_SCALERS.keys())}"
|
|
240
226
|
)
|
|
241
227
|
|
|
242
|
-
return scaler_type
|
|
228
|
+
return str(scaler_type) if scaler_type is not None else None
|
|
243
229
|
|
|
244
230
|
@classmethod
|
|
245
231
|
def _import_sklearn_components(cls) -> Dict[str, Any]:
|
|
@@ -6,7 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
from typing import Any, Set, Type, Union
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from mloda import ComputeFramework
|
|
10
10
|
|
|
11
11
|
from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataFrame
|
|
12
12
|
from mloda_plugins.feature_group.experimental.sklearn.scaling.base import ScalingFeatureGroup
|
|
@@ -21,7 +21,7 @@ class PandasScalingFeatureGroup(ScalingFeatureGroup):
|
|
|
21
21
|
"""
|
|
22
22
|
|
|
23
23
|
@classmethod
|
|
24
|
-
def compute_framework_rule(cls) -> Union[bool, Set[Type[
|
|
24
|
+
def compute_framework_rule(cls) -> Union[bool, Set[Type[ComputeFramework]]]:
|
|
25
25
|
"""Specify that this feature group works with Pandas."""
|
|
26
26
|
return {PandasDataFrame}
|
|
27
27
|
|
|
@@ -10,8 +10,8 @@ import tempfile
|
|
|
10
10
|
from pathlib import Path
|
|
11
11
|
from typing import Any, Dict, Optional
|
|
12
12
|
|
|
13
|
-
from
|
|
14
|
-
from
|
|
13
|
+
from mloda.provider import BaseArtifact
|
|
14
|
+
from mloda.provider import FeatureSet
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class SklearnArtifact(BaseArtifact):
|
|
@@ -39,16 +39,16 @@ Further, it allows defining:
|
|
|
39
39
|
"""
|
|
40
40
|
|
|
41
41
|
from typing import Any, Dict, NamedTuple, Optional, Set, Tuple, Type, Union
|
|
42
|
-
from
|
|
43
|
-
from
|
|
44
|
-
from
|
|
45
|
-
from
|
|
46
|
-
from
|
|
47
|
-
from
|
|
42
|
+
from mloda import FeatureGroup
|
|
43
|
+
from mloda import Feature
|
|
44
|
+
from mloda.user import FeatureName
|
|
45
|
+
from mloda.user import Index
|
|
46
|
+
from mloda.user import JoinType, Link, JoinSpec
|
|
47
|
+
from mloda import Options
|
|
48
48
|
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
class SourceInputFeature(
|
|
51
|
+
class SourceInputFeature(FeatureGroup):
|
|
52
52
|
"""
|
|
53
53
|
This feature group focuses on defining input features, especially when they originate
|
|
54
54
|
from other sources or require joins/merges.
|
|
@@ -100,19 +100,19 @@ class SourceTuple(NamedTuple):
|
|
|
100
100
|
|
|
101
101
|
Attributes:
|
|
102
102
|
feature_name: The name of the feature.
|
|
103
|
-
source_class: (Optional) The source class of the feature, can be an `
|
|
103
|
+
source_class: (Optional) The source class of the feature, can be an `FeatureGroup` class or a `str` representing a scope.
|
|
104
104
|
source_value: (Optional) The value associated with the source class, if applicable.
|
|
105
|
-
left_link: (Optional) A tuple containing the left-side `
|
|
106
|
-
right_link: (Optional) A tuple containing the right-side `
|
|
105
|
+
left_link: (Optional) A tuple containing the left-side `FeatureGroup` class and index for join operations.
|
|
106
|
+
right_link: (Optional) A tuple containing the right-side `FeatureGroup` class and index for join operations.
|
|
107
107
|
join_type: (Optional) The type of join operation (`JoinType`).
|
|
108
108
|
merge_index: (Optional) The index to use for merge operations.
|
|
109
109
|
"""
|
|
110
110
|
|
|
111
111
|
feature_name: str
|
|
112
|
-
source_class: Optional[Type[Union[
|
|
112
|
+
source_class: Optional[Type[Union[FeatureGroup, str]]] = None
|
|
113
113
|
source_value: Optional[str] = None
|
|
114
|
-
left_link: Optional[Tuple[Type[
|
|
115
|
-
right_link: Optional[Tuple[Type[
|
|
114
|
+
left_link: Optional[Tuple[Type[FeatureGroup], Union[str, Index]]] = None
|
|
115
|
+
right_link: Optional[Tuple[Type[FeatureGroup], Union[str, Index]]] = None
|
|
116
116
|
join_type: Optional[JoinType] = None
|
|
117
117
|
merge_index: Optional[Union[str, Index]] = None
|
|
118
118
|
|
|
@@ -207,8 +207,8 @@ class SourceInputFeatureComposite:
|
|
|
207
207
|
@classmethod
|
|
208
208
|
def _handle_link(
|
|
209
209
|
cls,
|
|
210
|
-
left_link: Tuple[Type[
|
|
211
|
-
right_link: Tuple[Type[
|
|
210
|
+
left_link: Tuple[Type[FeatureGroup], Union[str, Index]],
|
|
211
|
+
right_link: Tuple[Type[FeatureGroup], Union[str, Index]],
|
|
212
212
|
join_type: Any,
|
|
213
213
|
) -> Link:
|
|
214
214
|
"""
|
|
@@ -6,16 +6,19 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
from typing import Any, Optional, Set, Union
|
|
8
8
|
|
|
9
|
-
from
|
|
10
|
-
from
|
|
11
|
-
from
|
|
12
|
-
from
|
|
13
|
-
|
|
14
|
-
|
|
9
|
+
from mloda import FeatureGroup
|
|
10
|
+
from mloda import Feature
|
|
11
|
+
from mloda.provider import FeatureChainParser
|
|
12
|
+
from mloda.provider import (
|
|
13
|
+
FeatureChainParserMixin,
|
|
14
|
+
)
|
|
15
|
+
from mloda.user import FeatureName
|
|
16
|
+
from mloda.provider import FeatureSet
|
|
17
|
+
from mloda import Options
|
|
15
18
|
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys
|
|
16
19
|
|
|
17
20
|
|
|
18
|
-
class TextCleaningFeatureGroup(
|
|
21
|
+
class TextCleaningFeatureGroup(FeatureChainParserMixin, FeatureGroup):
|
|
19
22
|
# Option key for the list of operations
|
|
20
23
|
CLEANING_OPERATIONS = "cleaning_operations"
|
|
21
24
|
|
|
@@ -33,13 +36,17 @@ class TextCleaningFeatureGroup(AbstractFeatureGroup):
|
|
|
33
36
|
PATTERN = "__"
|
|
34
37
|
PREFIX_PATTERN = r".*__cleaned_text$"
|
|
35
38
|
|
|
39
|
+
# In-feature configuration for FeatureChainParserMixin
|
|
40
|
+
MIN_IN_FEATURES = 1
|
|
41
|
+
MAX_IN_FEATURES = 1
|
|
42
|
+
|
|
36
43
|
# Property mapping for configuration-based features
|
|
37
44
|
PROPERTY_MAPPING = {
|
|
38
45
|
CLEANING_OPERATIONS: {
|
|
39
46
|
**SUPPORTED_OPERATIONS, # All supported operations as valid options
|
|
40
|
-
DefaultOptionKeys.
|
|
41
|
-
DefaultOptionKeys.
|
|
42
|
-
DefaultOptionKeys.
|
|
47
|
+
DefaultOptionKeys.context: True, # Mark as context parameter
|
|
48
|
+
DefaultOptionKeys.strict_validation: True, # Enable strict validation
|
|
49
|
+
DefaultOptionKeys.validation_function: lambda operations: (
|
|
43
50
|
# Handle both actual tuples/lists and string representations
|
|
44
51
|
(
|
|
45
52
|
isinstance(operations, (tuple, list))
|
|
@@ -59,7 +66,7 @@ class TextCleaningFeatureGroup(AbstractFeatureGroup):
|
|
|
59
66
|
},
|
|
60
67
|
DefaultOptionKeys.in_features: {
|
|
61
68
|
"explanation": "Source feature to apply text cleaning operations to",
|
|
62
|
-
DefaultOptionKeys.
|
|
69
|
+
DefaultOptionKeys.context: True,
|
|
63
70
|
},
|
|
64
71
|
}
|
|
65
72
|
|
|
@@ -115,41 +122,6 @@ class TextCleaningFeatureGroup(AbstractFeatureGroup):
|
|
|
115
122
|
- The source feature must contain text data
|
|
116
123
|
"""
|
|
117
124
|
|
|
118
|
-
def input_features(self, options: Options, feature_name: FeatureName) -> Optional[Set[Feature]]:
|
|
119
|
-
"""Extract source feature from either configuration-based options or string parsing."""
|
|
120
|
-
|
|
121
|
-
source_feature: str | None = None
|
|
122
|
-
|
|
123
|
-
# Try string-based parsing first
|
|
124
|
-
_, source_feature = FeatureChainParser.parse_feature_name(feature_name, [self.PREFIX_PATTERN])
|
|
125
|
-
if source_feature is not None:
|
|
126
|
-
return {Feature(source_feature)}
|
|
127
|
-
|
|
128
|
-
# Fall back to configuration-based approach
|
|
129
|
-
source_features = options.get_in_features()
|
|
130
|
-
if len(source_features) != 1:
|
|
131
|
-
raise ValueError(
|
|
132
|
-
f"Expected exactly one source feature, but found {len(source_features)}: {source_features}"
|
|
133
|
-
)
|
|
134
|
-
return set(source_features)
|
|
135
|
-
|
|
136
|
-
@classmethod
|
|
137
|
-
def match_feature_group_criteria(
|
|
138
|
-
cls,
|
|
139
|
-
feature_name: Union[FeatureName, str],
|
|
140
|
-
options: Options,
|
|
141
|
-
data_access_collection: Optional[Any] = None,
|
|
142
|
-
) -> bool:
|
|
143
|
-
"""Check if feature name matches the expected pattern for text cleaning features."""
|
|
144
|
-
|
|
145
|
-
# Use the unified parser with property mapping for full configuration support
|
|
146
|
-
return FeatureChainParser.match_configuration_feature_chain_parser(
|
|
147
|
-
feature_name,
|
|
148
|
-
options,
|
|
149
|
-
property_mapping=cls.PROPERTY_MAPPING,
|
|
150
|
-
prefix_patterns=[cls.PREFIX_PATTERN],
|
|
151
|
-
)
|
|
152
|
-
|
|
153
125
|
@classmethod
|
|
154
126
|
def _extract_operations_and_source_feature(cls, feature: Feature) -> tuple[tuple[Any, Any], str]:
|
|
155
127
|
"""
|
|
@@ -166,31 +138,36 @@ class TextCleaningFeatureGroup(AbstractFeatureGroup):
|
|
|
166
138
|
Raises:
|
|
167
139
|
ValueError: If parameters cannot be extracted
|
|
168
140
|
"""
|
|
169
|
-
|
|
170
|
-
|
|
141
|
+
source_features = cls._extract_source_features(feature)
|
|
142
|
+
operations = cls._extract_cleaning_operations(feature)
|
|
143
|
+
if operations is None:
|
|
144
|
+
raise ValueError(f"Could not extract operations from: {feature.name}")
|
|
145
|
+
return operations, source_features[0]
|
|
171
146
|
|
|
147
|
+
@classmethod
|
|
148
|
+
def _extract_cleaning_operations(cls, feature: Feature) -> Optional[tuple[Any, Any]]:
|
|
149
|
+
"""
|
|
150
|
+
Extract cleaning operations from a feature.
|
|
151
|
+
|
|
152
|
+
Tries string-based parsing first, falls back to configuration-based approach.
|
|
153
|
+
|
|
154
|
+
Args:
|
|
155
|
+
feature: The feature to extract operations from
|
|
156
|
+
|
|
157
|
+
Returns:
|
|
158
|
+
Tuple of cleaning operations, or None if not found
|
|
159
|
+
"""
|
|
172
160
|
# Try string-based parsing first
|
|
173
161
|
feature_name_str = feature.name.name if hasattr(feature.name, "name") else str(feature.name)
|
|
174
162
|
|
|
175
163
|
if FeatureChainParser.is_chained_feature(feature_name_str):
|
|
176
|
-
_, source_feature_name = FeatureChainParser.parse_feature_name(feature_name_str, [cls.PREFIX_PATTERN])
|
|
177
164
|
# For string-based features, get operations from options
|
|
178
165
|
operations = feature.options.get(cls.CLEANING_OPERATIONS) or ()
|
|
179
|
-
|
|
180
|
-
raise ValueError(f"Could not extract source feature from string-based feature: {feature.name}")
|
|
181
|
-
return operations, source_feature_name # type: ignore
|
|
166
|
+
return operations # type: ignore
|
|
182
167
|
|
|
183
168
|
# Fall back to configuration-based approach
|
|
184
|
-
source_features = feature.options.get_in_features()
|
|
185
|
-
source_feature = next(iter(source_features))
|
|
186
|
-
source_feature_name = source_feature.get_name()
|
|
187
|
-
|
|
188
169
|
operations = feature.options.get(cls.CLEANING_OPERATIONS)
|
|
189
|
-
|
|
190
|
-
if operations is None or source_feature_name is None:
|
|
191
|
-
raise ValueError(f"Could not extract cleaning operations and source feature from: {feature.name}")
|
|
192
|
-
|
|
193
|
-
return operations, source_feature_name
|
|
170
|
+
return operations if operations is not None else None
|
|
194
171
|
|
|
195
172
|
@classmethod
|
|
196
173
|
def calculate_feature(cls, data: Any, features: FeatureSet) -> Any:
|
|
@@ -25,7 +25,7 @@ except ImportError:
|
|
|
25
25
|
pd = None
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
from
|
|
28
|
+
from mloda import ComputeFramework
|
|
29
29
|
from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataFrame
|
|
30
30
|
from mloda_plugins.feature_group.experimental.text_cleaning.base import TextCleaningFeatureGroup
|
|
31
31
|
|
|
@@ -40,7 +40,7 @@ class PandasTextCleaningFeatureGroup(TextCleaningFeatureGroup):
|
|
|
40
40
|
"""
|
|
41
41
|
|
|
42
42
|
@classmethod
|
|
43
|
-
def compute_framework_rule(cls) -> set[type[
|
|
43
|
+
def compute_framework_rule(cls) -> set[type[ComputeFramework]]:
|
|
44
44
|
"""Define the compute framework for this feature group."""
|
|
45
45
|
return {PandasDataFrame}
|
|
46
46
|
|
|
@@ -9,7 +9,7 @@ import string
|
|
|
9
9
|
import unicodedata
|
|
10
10
|
from typing import Any, Dict, List, Set, Type, Union
|
|
11
11
|
|
|
12
|
-
from
|
|
12
|
+
from mloda import ComputeFramework
|
|
13
13
|
|
|
14
14
|
from mloda_plugins.compute_framework.base_implementations.python_dict.python_dict_framework import PythonDictFramework
|
|
15
15
|
from mloda_plugins.feature_group.experimental.text_cleaning.base import TextCleaningFeatureGroup
|
|
@@ -35,7 +35,7 @@ class PythonDictTextCleaningFeatureGroup(TextCleaningFeatureGroup):
|
|
|
35
35
|
"""
|
|
36
36
|
|
|
37
37
|
@classmethod
|
|
38
|
-
def compute_framework_rule(cls) -> Union[bool, Set[Type[
|
|
38
|
+
def compute_framework_rule(cls) -> Union[bool, Set[Type[ComputeFramework]]]:
|
|
39
39
|
return {PythonDictFramework}
|
|
40
40
|
|
|
41
41
|
@classmethod
|