mloda 0.3.2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mloda/__init__.py +17 -0
- {mloda_core → mloda/core}/abstract_plugins/components/base_artifact.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/base_validator.py +13 -0
- {mloda_core → mloda/core}/abstract_plugins/components/data_access_collection.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/data_types.py +39 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature.py +39 -33
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/feature_chain_parser.py +19 -19
- mloda/core/abstract_plugins/components/feature_chainer/feature_chain_parser_mixin.py +197 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_collection.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/feature_group_version.py +8 -8
- {mloda_core → mloda/core}/abstract_plugins/components/feature_set.py +18 -24
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/cfw_transformer.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/index/add_index_feature.py +4 -4
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/api_input_data_collection.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/base_api_data.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/base_input_data.py +6 -6
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/data_creator.py +3 -3
- mloda/core/abstract_plugins/components/link.py +437 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/match_data.py +3 -3
- {mloda_core → mloda/core}/abstract_plugins/components/merge/base_merge_engine.py +2 -2
- {mloda_core → mloda/core}/abstract_plugins/components/options.py +12 -36
- {mloda_core → mloda/core}/abstract_plugins/components/parallelization_modes.py +1 -1
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/plugin_collector.py +14 -14
- mloda/core/abstract_plugins/components/validators/datatype_validator.py +96 -0
- mloda/core/abstract_plugins/components/validators/feature_set_validator.py +38 -0
- mloda/core/abstract_plugins/components/validators/feature_validator.py +23 -0
- mloda/core/abstract_plugins/components/validators/link_validator.py +79 -0
- mloda/core/abstract_plugins/components/validators/options_validator.py +57 -0
- mloda_core/abstract_plugins/compute_frame_work.py → mloda/core/abstract_plugins/compute_framework.py +46 -37
- mloda_core/abstract_plugins/abstract_feature_group.py → mloda/core/abstract_plugins/feature_group.py +56 -33
- mloda/core/abstract_plugins/function_extender.py +78 -0
- mloda/core/api/plugin_docs.py +220 -0
- mloda/core/api/plugin_info.py +32 -0
- {mloda_core → mloda/core}/api/prepare/setup_compute_framework.py +11 -11
- {mloda_core → mloda/core}/api/request.py +42 -33
- {mloda_core → mloda/core}/core/cfw_manager.py +8 -8
- {mloda_core → mloda/core}/core/engine.py +47 -46
- {mloda_core → mloda/core}/core/step/abstract_step.py +7 -7
- {mloda_core → mloda/core}/core/step/feature_group_step.py +12 -12
- {mloda_core → mloda/core}/core/step/join_step.py +14 -14
- {mloda_core → mloda/core}/core/step/transform_frame_work_step.py +16 -16
- {mloda_core → mloda/core}/filter/filter_engine.py +1 -1
- {mloda_core → mloda/core}/filter/filter_type_enum.py +1 -1
- {mloda_core → mloda/core}/filter/global_filter.py +23 -23
- {mloda_core → mloda/core}/filter/single_filter.py +6 -6
- {mloda_core → mloda/core}/prepare/accessible_plugins.py +16 -18
- {mloda_core → mloda/core}/prepare/execution_plan.py +65 -39
- {mloda_core → mloda/core}/prepare/graph/build_graph.py +6 -6
- {mloda_core → mloda/core}/prepare/graph/graph.py +1 -1
- {mloda_core → mloda/core}/prepare/graph/properties.py +5 -5
- {mloda_core → mloda/core}/prepare/identify_feature_group.py +12 -14
- {mloda_core → mloda/core}/prepare/joinstep_collection.py +3 -3
- {mloda_core → mloda/core}/prepare/resolve_compute_frameworks.py +6 -6
- {mloda_core → mloda/core}/prepare/resolve_graph.py +11 -11
- {mloda_core → mloda/core}/prepare/resolve_links.py +31 -40
- mloda/core/prepare/validators/resolve_link_validator.py +32 -0
- mloda/core/runtime/compute_framework_executor.py +271 -0
- mloda/core/runtime/data_lifecycle_manager.py +160 -0
- mloda/core/runtime/flight/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/runner_flight_server.py +1 -1
- mloda/core/runtime/run.py +317 -0
- mloda/core/runtime/worker/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/worker/multiprocessing_worker.py +15 -10
- {mloda_core → mloda/core}/runtime/worker/thread_worker.py +2 -2
- mloda/core/runtime/worker_manager.py +96 -0
- mloda/provider/__init__.py +101 -0
- mloda/steward/__init__.py +25 -0
- mloda/user/__init__.py +57 -0
- {mloda-0.3.2.dist-info → mloda-0.4.0.dist-info}/METADATA +18 -22
- mloda-0.4.0.dist-info/RECORD +248 -0
- {mloda-0.3.2.dist-info → mloda-0.4.0.dist-info}/top_level.txt +1 -1
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_framework.py +15 -13
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/duckdb/duckdb_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_framework.py +12 -10
- mloda_plugins/compute_framework/base_implementations/iceberg/iceberg_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pandas/dataframe.py +18 -16
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_filter_engine.py +36 -13
- mloda_plugins/compute_framework/base_implementations/pandas/pandas_merge_engine.py +7 -7
- mloda_plugins/compute_framework/base_implementations/pandas/pandaspyarrowtransformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/dataframe.py +16 -14
- mloda_plugins/compute_framework/base_implementations/polars/lazy_dataframe.py +13 -12
- mloda_plugins/compute_framework/base_implementations/polars/polars_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/polars/polars_lazy_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/polars/polars_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/polars/polars_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/pyarrow/pyarrow_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/pyarrow/table.py +12 -10
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_framework.py +11 -9
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_pyarrow_transformer.py +1 -1
- mloda_plugins/compute_framework/base_implementations/spark/spark_filter_engine.py +2 -2
- mloda_plugins/compute_framework/base_implementations/spark/spark_framework.py +17 -15
- mloda_plugins/compute_framework/base_implementations/spark/spark_merge_engine.py +3 -3
- mloda_plugins/compute_framework/base_implementations/spark/spark_pyarrow_transformer.py +1 -1
- mloda_plugins/config/feature/loader.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/base.py +45 -62
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/polars_lazy.py +2 -2
- mloda_plugins/feature_group/experimental/aggregated_feature_group/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/clustering/base.py +69 -97
- mloda_plugins/feature_group/experimental/clustering/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/base.py +58 -79
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/pyarrow.py +2 -2
- mloda_plugins/feature_group/experimental/data_quality/missing_value/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/default_options_key.py +16 -19
- mloda_plugins/feature_group/experimental/dimensionality_reduction/base.py +80 -94
- mloda_plugins/feature_group/experimental/dimensionality_reduction/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/dynamic_feature_group_factory/dynamic_feature_group_factory.py +24 -24
- mloda_plugins/feature_group/experimental/forecasting/base.py +106 -104
- mloda_plugins/feature_group/experimental/forecasting/forecasting_artifact.py +2 -2
- mloda_plugins/feature_group/experimental/forecasting/pandas.py +15 -15
- mloda_plugins/feature_group/experimental/geo_distance/base.py +50 -42
- mloda_plugins/feature_group/experimental/geo_distance/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/llm/cli.py +4 -4
- mloda_plugins/feature_group/experimental/llm/cli_features/refactor_git_cached.py +19 -19
- mloda_plugins/feature_group/experimental/llm/installed_packages_feature_group.py +8 -8
- mloda_plugins/feature_group/experimental/llm/list_directory_feature_group.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/claude.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/gemini.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/llm_base_request.py +5 -5
- mloda_plugins/feature_group/experimental/llm/llm_api/openai.py +3 -3
- mloda_plugins/feature_group/experimental/llm/llm_api/request_loop.py +6 -6
- mloda_plugins/feature_group/experimental/llm/llm_file_selector.py +10 -10
- mloda_plugins/feature_group/experimental/llm/tools/tool_collection.py +1 -1
- mloda_plugins/feature_group/experimental/node_centrality/base.py +46 -72
- mloda_plugins/feature_group/experimental/node_centrality/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/encoding/base.py +51 -51
- mloda_plugins/feature_group/experimental/sklearn/encoding/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/pipeline/base.py +52 -39
- mloda_plugins/feature_group/experimental/sklearn/pipeline/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/scaling/base.py +44 -58
- mloda_plugins/feature_group/experimental/sklearn/scaling/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/sklearn/sklearn_artifact.py +2 -2
- mloda_plugins/feature_group/experimental/source_input_feature.py +15 -15
- mloda_plugins/feature_group/experimental/text_cleaning/base.py +38 -61
- mloda_plugins/feature_group/experimental/text_cleaning/pandas.py +2 -2
- mloda_plugins/feature_group/experimental/text_cleaning/python_dict.py +2 -2
- mloda_plugins/feature_group/experimental/time_window/base.py +106 -93
- mloda_plugins/feature_group/experimental/time_window/pandas.py +13 -13
- mloda_plugins/feature_group/experimental/time_window/pyarrow.py +12 -12
- mloda_plugins/feature_group/input_data/api_data/api_data.py +9 -11
- mloda_plugins/feature_group/input_data/read_context_files.py +7 -7
- mloda_plugins/feature_group/input_data/read_db.py +7 -9
- mloda_plugins/feature_group/input_data/read_db_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_dbs/sqlite.py +23 -13
- mloda_plugins/feature_group/input_data/read_file.py +8 -8
- mloda_plugins/feature_group/input_data/read_file_feature.py +4 -4
- mloda_plugins/feature_group/input_data/read_files/csv.py +6 -6
- mloda_plugins/feature_group/input_data/read_files/feather.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/json.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/orc.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/parquet.py +5 -5
- mloda_plugins/feature_group/input_data/read_files/text_file_reader.py +5 -5
- mloda_plugins/function_extender/base_implementations/otel/otel_extender.py +4 -4
- mloda-0.3.2.dist-info/RECORD +0 -230
- mloda_core/abstract_plugins/components/link.py +0 -286
- mloda_core/abstract_plugins/function_extender.py +0 -34
- mloda_core/runtime/run.py +0 -617
- {mloda_core → mloda/core}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/domain.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_chainer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/feature_name.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/framework_transformer/base_transformer.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/hashable_dict.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/index/index.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/api/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/input_data/creator/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/match_data/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/merge/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/plugin_option/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/components/utils.py +0 -0
- {mloda_core/abstract_plugins/plugin_loader → mloda/core/abstract_plugins/components/validators}/__init__.py +0 -0
- {mloda_core/api → mloda/core/abstract_plugins/plugin_loader}/__init__.py +0 -0
- {mloda_core → mloda/core}/abstract_plugins/plugin_loader/plugin_loader.py +0 -0
- {mloda_core/api/prepare → mloda/core/api}/__init__.py +0 -0
- {mloda_core/core → mloda/core/api/prepare}/__init__.py +0 -0
- {mloda_core/core/step → mloda/core/core}/__init__.py +0 -0
- {mloda_core/filter → mloda/core/core/step}/__init__.py +0 -0
- {mloda_core/prepare → mloda/core/filter}/__init__.py +0 -0
- {mloda_core → mloda/core}/filter/filter_parameter.py +0 -0
- {mloda_core/prepare/graph → mloda/core/prepare}/__init__.py +0 -0
- {mloda_core/runtime → mloda/core/prepare/graph}/__init__.py +0 -0
- {mloda_core/runtime/flight → mloda/core/prepare/validators}/__init__.py +0 -0
- {mloda_core/runtime/worker → mloda/core/runtime}/__init__.py +0 -0
- {mloda_core → mloda/core}/runtime/flight/flight_server.py +0 -0
- {mloda-0.3.2.dist-info → mloda-0.4.0.dist-info}/WHEEL +0 -0
- {mloda-0.3.2.dist-info → mloda-0.4.0.dist-info}/entry_points.txt +0 -0
- {mloda-0.3.2.dist-info → mloda-0.4.0.dist-info}/licenses/LICENSE.TXT +0 -0
- {mloda-0.3.2.dist-info → mloda-0.4.0.dist-info}/licenses/NOTICE.md +0 -0
mloda_plugins/compute_framework/base_implementations/python_dict/python_dict_merge_engine.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import Any, Set, Tuple
|
|
2
|
-
from
|
|
3
|
-
from
|
|
4
|
-
from
|
|
2
|
+
from mloda.provider import BaseMergeEngine
|
|
3
|
+
from mloda.user import Index
|
|
4
|
+
from mloda.user import JoinType
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
class PythonDictMergeEngine(BaseMergeEngine):
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from typing import Any
|
|
2
|
-
from
|
|
3
|
-
from
|
|
2
|
+
from mloda.provider import BaseFilterEngine
|
|
3
|
+
from mloda.user import SingleFilter
|
|
4
4
|
|
|
5
5
|
try:
|
|
6
6
|
from pyspark.sql import DataFrame
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import logging
|
|
2
2
|
from typing import Any, Set, Type, Optional
|
|
3
|
-
from
|
|
3
|
+
from mloda.provider import BaseMergeEngine
|
|
4
4
|
from mloda_plugins.compute_framework.base_implementations.spark.spark_merge_engine import SparkMergeEngine
|
|
5
|
-
from
|
|
6
|
-
from
|
|
7
|
-
from
|
|
5
|
+
from mloda.user import FeatureName
|
|
6
|
+
from mloda import ComputeFramework
|
|
7
|
+
from mloda.provider import BaseFilterEngine
|
|
8
8
|
from mloda_plugins.compute_framework.base_implementations.spark.spark_filter_engine import SparkFilterEngine
|
|
9
9
|
|
|
10
10
|
try:
|
|
@@ -25,8 +25,8 @@ except ImportError:
|
|
|
25
25
|
logger = logging.getLogger(__name__)
|
|
26
26
|
|
|
27
27
|
|
|
28
|
-
class SparkFramework(
|
|
29
|
-
"""Spark framework implementation for
|
|
28
|
+
class SparkFramework(ComputeFramework):
|
|
29
|
+
"""Spark framework implementation for ComputeFramework.
|
|
30
30
|
|
|
31
31
|
This framework leverages Apache Spark for distributed data processing.
|
|
32
32
|
It requires a SparkSession to be provided through the framework connection object.
|
|
@@ -62,11 +62,12 @@ class SparkFramework(ComputeFrameWork):
|
|
|
62
62
|
except ImportError:
|
|
63
63
|
return False
|
|
64
64
|
|
|
65
|
-
@
|
|
66
|
-
def expected_data_framework() -> Any:
|
|
67
|
-
return
|
|
65
|
+
@classmethod
|
|
66
|
+
def expected_data_framework(cls) -> Any:
|
|
67
|
+
return cls.spark_dataframe()
|
|
68
68
|
|
|
69
|
-
|
|
69
|
+
@classmethod
|
|
70
|
+
def merge_engine(cls) -> Type[BaseMergeEngine]:
|
|
70
71
|
return SparkMergeEngine
|
|
71
72
|
|
|
72
73
|
def select_data_by_column_names(self, data: Any, selected_feature_names: Set[FeatureName]) -> Any:
|
|
@@ -78,14 +79,14 @@ class SparkFramework(ComputeFrameWork):
|
|
|
78
79
|
if self.data is not None:
|
|
79
80
|
self.column_names = set(self.data.columns)
|
|
80
81
|
|
|
81
|
-
@
|
|
82
|
-
def spark_dataframe() -> Any:
|
|
82
|
+
@classmethod
|
|
83
|
+
def spark_dataframe(cls) -> Any:
|
|
83
84
|
if DataFrame is None:
|
|
84
85
|
raise ImportError("PySpark is not installed. To be able to use this framework, please install pyspark.")
|
|
85
86
|
return DataFrame
|
|
86
87
|
|
|
87
|
-
@
|
|
88
|
-
def spark_session() -> Any:
|
|
88
|
+
@classmethod
|
|
89
|
+
def spark_session(cls) -> Any:
|
|
89
90
|
if SparkSession is None:
|
|
90
91
|
raise ImportError("PySpark is not installed. To be able to use this framework, please install pyspark.")
|
|
91
92
|
return SparkSession
|
|
@@ -194,5 +195,6 @@ class SparkFramework(ComputeFrameWork):
|
|
|
194
195
|
|
|
195
196
|
raise ValueError(f"Data {type(data)} is not supported by {self.__class__.__name__}")
|
|
196
197
|
|
|
197
|
-
|
|
198
|
+
@classmethod
|
|
199
|
+
def filter_engine(cls) -> Type[BaseFilterEngine]:
|
|
198
200
|
return SparkFilterEngine
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from typing import Any, Tuple
|
|
2
2
|
|
|
3
|
-
from
|
|
4
|
-
from
|
|
5
|
-
from
|
|
3
|
+
from mloda.user import Index
|
|
4
|
+
from mloda.user import JoinType
|
|
5
|
+
from mloda.provider import BaseMergeEngine
|
|
6
6
|
|
|
7
7
|
try:
|
|
8
8
|
from pyspark.sql import DataFrame
|
|
@@ -6,8 +6,8 @@ to mloda Feature instances.
|
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
from typing import List, Union, Dict, Any
|
|
9
|
-
from
|
|
10
|
-
from
|
|
9
|
+
from mloda import Feature
|
|
10
|
+
from mloda import Options
|
|
11
11
|
from mloda_plugins.config.feature.parser import parse_json
|
|
12
12
|
from mloda_plugins.config.feature.models import FeatureConfig
|
|
13
13
|
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys
|
|
@@ -7,16 +7,19 @@ from __future__ import annotations
|
|
|
7
7
|
from abc import abstractmethod
|
|
8
8
|
from typing import Any, List, Optional, Set, Union
|
|
9
9
|
|
|
10
|
-
from
|
|
11
|
-
from
|
|
12
|
-
from
|
|
13
|
-
from
|
|
14
|
-
from
|
|
15
|
-
from
|
|
10
|
+
from mloda import FeatureGroup
|
|
11
|
+
from mloda import Feature
|
|
12
|
+
from mloda.user import FeatureName
|
|
13
|
+
from mloda.provider import FeatureSet
|
|
14
|
+
from mloda import Options
|
|
15
|
+
from mloda.provider import FeatureChainParser
|
|
16
|
+
from mloda.provider import (
|
|
17
|
+
FeatureChainParserMixin,
|
|
18
|
+
)
|
|
16
19
|
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys
|
|
17
20
|
|
|
18
21
|
|
|
19
|
-
class AggregatedFeatureGroup(
|
|
22
|
+
class AggregatedFeatureGroup(FeatureChainParserMixin, FeatureGroup):
|
|
20
23
|
"""
|
|
21
24
|
Base class for all aggregated feature groups.
|
|
22
25
|
|
|
@@ -99,38 +102,24 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
|
|
|
99
102
|
|
|
100
103
|
PREFIX_PATTERN = r".*__([\w]+)_aggr$"
|
|
101
104
|
|
|
105
|
+
# In-feature configuration for FeatureChainParserMixin
|
|
106
|
+
MIN_IN_FEATURES = 1
|
|
107
|
+
MAX_IN_FEATURES = 1
|
|
108
|
+
|
|
102
109
|
# Property mapping for configuration-based feature creation
|
|
103
110
|
PROPERTY_MAPPING = {
|
|
104
111
|
AGGREGATION_TYPE: {
|
|
105
112
|
**AGGREGATION_TYPES, # All supported aggregation types as valid values
|
|
106
|
-
DefaultOptionKeys.
|
|
107
|
-
DefaultOptionKeys.
|
|
113
|
+
DefaultOptionKeys.context: True, # Mark as context parameter
|
|
114
|
+
DefaultOptionKeys.strict_validation: True, # Enable strict validation
|
|
108
115
|
},
|
|
109
116
|
DefaultOptionKeys.in_features: {
|
|
110
117
|
"explanation": "Source feature to aggregate",
|
|
111
|
-
DefaultOptionKeys.
|
|
112
|
-
DefaultOptionKeys.
|
|
118
|
+
DefaultOptionKeys.context: True, # Mark as context parameter
|
|
119
|
+
DefaultOptionKeys.strict_validation: False, # Flexible validation
|
|
113
120
|
},
|
|
114
121
|
}
|
|
115
122
|
|
|
116
|
-
def input_features(self, options: Options, feature_name: FeatureName) -> Optional[Set[Feature]]:
|
|
117
|
-
"""Extract source feature from either configuration-based options or string parsing."""
|
|
118
|
-
|
|
119
|
-
source_feature: str | None = None
|
|
120
|
-
|
|
121
|
-
# string based
|
|
122
|
-
_, source_feature = FeatureChainParser.parse_feature_name(feature_name, [self.PREFIX_PATTERN])
|
|
123
|
-
if source_feature is not None:
|
|
124
|
-
return {Feature(source_feature)}
|
|
125
|
-
|
|
126
|
-
# configuration based
|
|
127
|
-
source_features = options.get_in_features()
|
|
128
|
-
if len(source_features) != 1:
|
|
129
|
-
raise ValueError(
|
|
130
|
-
f"Expected exactly one source feature, but found {len(source_features)}: {source_features}"
|
|
131
|
-
)
|
|
132
|
-
return set(source_features)
|
|
133
|
-
|
|
134
123
|
@classmethod
|
|
135
124
|
def get_aggregation_type(cls, feature_name: str) -> str:
|
|
136
125
|
"""Extract the aggregation type from the feature name."""
|
|
@@ -140,21 +129,26 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
|
|
|
140
129
|
return prefix_part
|
|
141
130
|
|
|
142
131
|
@classmethod
|
|
143
|
-
def
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
132
|
+
def _extract_aggregation_type(cls, feature: Feature) -> Optional[str]:
|
|
133
|
+
"""
|
|
134
|
+
Extract aggregation type from a feature.
|
|
135
|
+
|
|
136
|
+
Tries string-based parsing first, falls back to configuration.
|
|
137
|
+
|
|
138
|
+
Args:
|
|
139
|
+
feature: The feature to extract aggregation type from
|
|
140
|
+
|
|
141
|
+
Returns:
|
|
142
|
+
The aggregation type, or None if not found
|
|
143
|
+
"""
|
|
144
|
+
# Try string-based parsing first
|
|
145
|
+
aggregation_type, _ = FeatureChainParser.parse_feature_name(feature.name, [cls.PREFIX_PATTERN])
|
|
146
|
+
if aggregation_type is not None:
|
|
147
|
+
return aggregation_type
|
|
148
|
+
|
|
149
|
+
# Fall back to configuration
|
|
150
|
+
aggregation_type = feature.options.get(cls.AGGREGATION_TYPE)
|
|
151
|
+
return str(aggregation_type) if aggregation_type is not None else None
|
|
158
152
|
|
|
159
153
|
@classmethod
|
|
160
154
|
def _extract_aggr_and_source_feature(cls, feature: Feature) -> tuple[str, str]:
|
|
@@ -172,27 +166,16 @@ class AggregatedFeatureGroup(AbstractFeatureGroup):
|
|
|
172
166
|
Raises:
|
|
173
167
|
ValueError: If parameters cannot be extracted
|
|
174
168
|
"""
|
|
175
|
-
|
|
176
|
-
|
|
169
|
+
# Use the mixin method to extract source features
|
|
170
|
+
source_features = cls._extract_source_features(feature)
|
|
177
171
|
|
|
178
|
-
#
|
|
179
|
-
aggregation_type
|
|
180
|
-
feature.name, [cls.PREFIX_PATTERN]
|
|
181
|
-
)
|
|
182
|
-
if aggregation_type is not None and source_feature_name is not None:
|
|
183
|
-
return aggregation_type, source_feature_name
|
|
184
|
-
|
|
185
|
-
# configuration based
|
|
186
|
-
source_features = feature.options.get_in_features()
|
|
187
|
-
source_feature = next(iter(source_features))
|
|
188
|
-
source_feature_name = source_feature.get_name()
|
|
189
|
-
|
|
190
|
-
aggregation_type = feature.options.get(cls.AGGREGATION_TYPE)
|
|
172
|
+
# Extract aggregation type
|
|
173
|
+
aggregation_type = cls._extract_aggregation_type(feature)
|
|
191
174
|
|
|
192
|
-
if aggregation_type is None
|
|
193
|
-
raise ValueError(f"Could not extract aggregation type
|
|
175
|
+
if aggregation_type is None:
|
|
176
|
+
raise ValueError(f"Could not extract aggregation type from: {feature.name}")
|
|
194
177
|
|
|
195
|
-
return aggregation_type,
|
|
178
|
+
return aggregation_type, source_features[0]
|
|
196
179
|
|
|
197
180
|
@classmethod
|
|
198
181
|
def _supports_aggregation_type(cls, aggregation_type: str) -> bool:
|
|
@@ -6,7 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
from typing import Any, List, Set, Type, Union
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from mloda import ComputeFramework
|
|
10
10
|
|
|
11
11
|
from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataFrame
|
|
12
12
|
from mloda_plugins.feature_group.experimental.aggregated_feature_group.base import AggregatedFeatureGroup
|
|
@@ -14,7 +14,7 @@ from mloda_plugins.feature_group.experimental.aggregated_feature_group.base impo
|
|
|
14
14
|
|
|
15
15
|
class PandasAggregatedFeatureGroup(AggregatedFeatureGroup):
|
|
16
16
|
@classmethod
|
|
17
|
-
def compute_framework_rule(cls) -> Union[bool, Set[Type[
|
|
17
|
+
def compute_framework_rule(cls) -> Union[bool, Set[Type[ComputeFramework]]]:
|
|
18
18
|
"""Specify that this feature group works with Pandas."""
|
|
19
19
|
return {PandasDataFrame}
|
|
20
20
|
|
|
@@ -6,7 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
from typing import Any, List, Set, Type, Union
|
|
8
8
|
|
|
9
|
-
from
|
|
9
|
+
from mloda import ComputeFramework
|
|
10
10
|
|
|
11
11
|
from mloda_plugins.compute_framework.base_implementations.polars.lazy_dataframe import PolarsLazyDataFrame
|
|
12
12
|
from mloda_plugins.feature_group.experimental.aggregated_feature_group.base import AggregatedFeatureGroup
|
|
@@ -26,7 +26,7 @@ class PolarsLazyAggregatedFeatureGroup(AggregatedFeatureGroup):
|
|
|
26
26
|
"""
|
|
27
27
|
|
|
28
28
|
@classmethod
|
|
29
|
-
def compute_framework_rule(cls) -> Union[bool, Set[Type[
|
|
29
|
+
def compute_framework_rule(cls) -> Union[bool, Set[Type[ComputeFramework]]]:
|
|
30
30
|
"""Specify that this feature group works with Polars Lazy DataFrames."""
|
|
31
31
|
return {PolarsLazyDataFrame}
|
|
32
32
|
|
|
@@ -9,7 +9,7 @@ from typing import Any, List, Set, Type, Union
|
|
|
9
9
|
import pyarrow as pa
|
|
10
10
|
import pyarrow.compute as pc
|
|
11
11
|
|
|
12
|
-
from
|
|
12
|
+
from mloda import ComputeFramework
|
|
13
13
|
|
|
14
14
|
from mloda_plugins.compute_framework.base_implementations.pyarrow.table import PyArrowTable
|
|
15
15
|
from mloda_plugins.feature_group.experimental.aggregated_feature_group.base import AggregatedFeatureGroup
|
|
@@ -23,7 +23,7 @@ class PyArrowAggregatedFeatureGroup(AggregatedFeatureGroup):
|
|
|
23
23
|
"""
|
|
24
24
|
|
|
25
25
|
@classmethod
|
|
26
|
-
def compute_framework_rule(cls) -> Union[bool, Set[Type[
|
|
26
|
+
def compute_framework_rule(cls) -> Union[bool, Set[Type[ComputeFramework]]]:
|
|
27
27
|
"""Specify that this feature group works with PyArrow."""
|
|
28
28
|
return {PyArrowTable}
|
|
29
29
|
|
|
@@ -7,16 +7,19 @@ from __future__ import annotations
|
|
|
7
7
|
from abc import abstractmethod
|
|
8
8
|
from typing import Any, List, Optional, Set, Union
|
|
9
9
|
|
|
10
|
-
from
|
|
11
|
-
from
|
|
12
|
-
from
|
|
13
|
-
from
|
|
14
|
-
|
|
15
|
-
|
|
10
|
+
from mloda import FeatureGroup
|
|
11
|
+
from mloda import Feature
|
|
12
|
+
from mloda.provider import FeatureChainParser
|
|
13
|
+
from mloda.provider import (
|
|
14
|
+
FeatureChainParserMixin,
|
|
15
|
+
)
|
|
16
|
+
from mloda.user import FeatureName
|
|
17
|
+
from mloda.provider import FeatureSet
|
|
18
|
+
from mloda import Options
|
|
16
19
|
from mloda_plugins.feature_group.experimental.default_options_key import DefaultOptionKeys
|
|
17
20
|
|
|
18
21
|
|
|
19
|
-
class ClusteringFeatureGroup(
|
|
22
|
+
class ClusteringFeatureGroup(FeatureChainParserMixin, FeatureGroup):
|
|
20
23
|
# Option keys for clustering configuration
|
|
21
24
|
"""
|
|
22
25
|
Base class for all clustering feature groups.
|
|
@@ -105,53 +108,49 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
|
|
|
105
108
|
# Define the prefix pattern for this feature group
|
|
106
109
|
PREFIX_PATTERN = r".*__cluster_([\w]+)_([\w]+)$"
|
|
107
110
|
|
|
111
|
+
# In-feature configuration for FeatureChainParserMixin
|
|
112
|
+
MIN_IN_FEATURES = 1
|
|
113
|
+
MAX_IN_FEATURES = None # Unlimited in_features allowed
|
|
114
|
+
|
|
108
115
|
# Property mapping for configuration-based feature creation
|
|
109
116
|
PROPERTY_MAPPING = {
|
|
110
117
|
ALGORITHM: {
|
|
111
118
|
**CLUSTERING_ALGORITHMS, # All supported algorithms as valid values
|
|
112
|
-
DefaultOptionKeys.
|
|
113
|
-
DefaultOptionKeys.
|
|
119
|
+
DefaultOptionKeys.context: True, # Mark as context parameter
|
|
120
|
+
DefaultOptionKeys.strict_validation: True, # Enable strict validation
|
|
114
121
|
},
|
|
115
122
|
K_VALUE: {
|
|
116
123
|
"explanation": "Number of clusters or 'auto' for automatic determination",
|
|
117
|
-
DefaultOptionKeys.
|
|
118
|
-
DefaultOptionKeys.
|
|
119
|
-
DefaultOptionKeys.
|
|
124
|
+
DefaultOptionKeys.context: True, # Mark as context parameter
|
|
125
|
+
DefaultOptionKeys.strict_validation: True, # Enable strict validation
|
|
126
|
+
DefaultOptionKeys.validation_function: lambda value: value == "auto"
|
|
120
127
|
or (isinstance(value, (int, str)) and str(value).isdigit() and int(value) > 0),
|
|
121
128
|
},
|
|
122
129
|
DefaultOptionKeys.in_features: {
|
|
123
130
|
"explanation": "Source features to use for clustering",
|
|
124
|
-
DefaultOptionKeys.
|
|
125
|
-
DefaultOptionKeys.
|
|
131
|
+
DefaultOptionKeys.context: True, # Mark as context parameter
|
|
132
|
+
DefaultOptionKeys.strict_validation: False, # Flexible validation
|
|
126
133
|
},
|
|
127
134
|
OUTPUT_PROBABILITIES: {
|
|
128
135
|
"explanation": "Whether to output cluster probabilities/distances as separate columns using ~N suffix pattern",
|
|
129
|
-
DefaultOptionKeys.
|
|
130
|
-
DefaultOptionKeys.
|
|
131
|
-
DefaultOptionKeys.
|
|
132
|
-
DefaultOptionKeys.
|
|
136
|
+
DefaultOptionKeys.context: True, # Mark as context parameter
|
|
137
|
+
DefaultOptionKeys.strict_validation: False, # Flexible validation
|
|
138
|
+
DefaultOptionKeys.default: False, # Default is False (don't output probabilities)
|
|
139
|
+
DefaultOptionKeys.validation_function: lambda value: isinstance(value, bool),
|
|
133
140
|
},
|
|
134
141
|
}
|
|
135
142
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
source_features.add(Feature(feature.strip()))
|
|
148
|
-
return source_features
|
|
149
|
-
|
|
150
|
-
# configuration based
|
|
151
|
-
source_features_frozen = options.get_in_features()
|
|
152
|
-
if len(source_features_frozen) < 1:
|
|
153
|
-
raise ValueError(f"Feature '{feature_name}' requires at least one source feature, but none were provided.")
|
|
154
|
-
return set(source_features_frozen)
|
|
143
|
+
@classmethod
|
|
144
|
+
def _validate_string_match(cls, feature_name: str, operation_config: str, source_feature: str) -> bool:
|
|
145
|
+
"""Validate clustering-specific string patterns using parse_clustering_prefix()."""
|
|
146
|
+
if FeatureChainParser.is_chained_feature(feature_name):
|
|
147
|
+
try:
|
|
148
|
+
# Use existing validation logic that validates algorithm and k_value
|
|
149
|
+
cls.parse_clustering_prefix(feature_name)
|
|
150
|
+
except ValueError:
|
|
151
|
+
# If validation fails, this feature doesn't match
|
|
152
|
+
return False
|
|
153
|
+
return True
|
|
155
154
|
|
|
156
155
|
@classmethod
|
|
157
156
|
def parse_clustering_prefix(cls, feature_name: str) -> tuple[str, str]:
|
|
@@ -213,41 +212,12 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
|
|
|
213
212
|
k_value = cls.parse_clustering_prefix(feature_name)[1]
|
|
214
213
|
return k_value if k_value == "auto" else int(k_value)
|
|
215
214
|
|
|
216
|
-
|
|
217
|
-
def match_feature_group_criteria(
|
|
218
|
-
cls,
|
|
219
|
-
feature_name: Union[FeatureName, str],
|
|
220
|
-
options: Options,
|
|
221
|
-
data_access_collection: Optional[Any] = None,
|
|
222
|
-
) -> bool:
|
|
223
|
-
"""Check if feature name matches the expected pattern for clustering features."""
|
|
224
|
-
|
|
225
|
-
# Use the unified parser with property mapping for full configuration support
|
|
226
|
-
result = FeatureChainParser.match_configuration_feature_chain_parser(
|
|
227
|
-
feature_name,
|
|
228
|
-
options,
|
|
229
|
-
property_mapping=cls.PROPERTY_MAPPING,
|
|
230
|
-
prefix_patterns=[cls.PREFIX_PATTERN],
|
|
231
|
-
)
|
|
232
|
-
|
|
233
|
-
# If it matches and it's a string-based feature, validate with our custom logic
|
|
234
|
-
if result:
|
|
235
|
-
feature_name_str = feature_name.name if isinstance(feature_name, FeatureName) else feature_name
|
|
236
|
-
|
|
237
|
-
# Check if this is a string-based feature (contains the pattern)
|
|
238
|
-
if FeatureChainParser.is_chained_feature(feature_name_str):
|
|
239
|
-
try:
|
|
240
|
-
# Use existing validation logic that validates algorithm and k_value
|
|
241
|
-
cls.parse_clustering_prefix(feature_name_str)
|
|
242
|
-
except ValueError:
|
|
243
|
-
# If validation fails, this feature doesn't match
|
|
244
|
-
return False
|
|
245
|
-
return result
|
|
215
|
+
# Custom validation done via _validate_string_match() hook
|
|
246
216
|
|
|
247
217
|
@classmethod
|
|
248
|
-
def
|
|
218
|
+
def _extract_clustering_params(cls, feature: Feature) -> tuple[Optional[str], Optional[Union[int, str]]]:
|
|
249
219
|
"""
|
|
250
|
-
Extract algorithm
|
|
220
|
+
Extract algorithm and k_value from a feature.
|
|
251
221
|
|
|
252
222
|
Tries string-based approach first, falls back to configuration-based.
|
|
253
223
|
|
|
@@ -255,47 +225,49 @@ class ClusteringFeatureGroup(AbstractFeatureGroup):
|
|
|
255
225
|
feature: The feature to extract parameters from
|
|
256
226
|
|
|
257
227
|
Returns:
|
|
258
|
-
Tuple of (algorithm, k_value,
|
|
228
|
+
Tuple of (algorithm, k_value) or (None, None) if extraction fails
|
|
259
229
|
|
|
260
230
|
Raises:
|
|
261
|
-
ValueError: If
|
|
231
|
+
ValueError: If string-based parsing fails due to invalid format
|
|
262
232
|
"""
|
|
263
|
-
|
|
264
|
-
k_value: str | int | None = None
|
|
265
|
-
source_features = None
|
|
266
|
-
|
|
267
|
-
# string based
|
|
233
|
+
# Try string-based parsing first
|
|
268
234
|
algorithm_str, source_features_str = FeatureChainParser.parse_feature_name(feature.name, [cls.PREFIX_PATTERN])
|
|
269
235
|
if algorithm_str is not None and source_features_str is not None:
|
|
270
|
-
# Parse the algorithm and k_value from the prefix
|
|
271
236
|
algorithm, k_value_str = cls.parse_clustering_prefix(feature.get_name())
|
|
237
|
+
k_value: Union[int, str] = "auto" if k_value_str == "auto" else int(k_value_str)
|
|
238
|
+
return algorithm, k_value
|
|
272
239
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
else:
|
|
277
|
-
k_value = int(k_value_str)
|
|
240
|
+
# Fall back to configuration-based
|
|
241
|
+
algorithm = feature.options.get(cls.ALGORITHM)
|
|
242
|
+
k_value_raw = feature.options.get(cls.K_VALUE)
|
|
278
243
|
|
|
279
|
-
|
|
280
|
-
|
|
244
|
+
if k_value_raw is None:
|
|
245
|
+
return algorithm, None
|
|
281
246
|
|
|
282
|
-
|
|
247
|
+
k_value = "auto" if k_value_raw == "auto" else int(k_value_raw)
|
|
248
|
+
return algorithm, k_value
|
|
283
249
|
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
250
|
+
@classmethod
|
|
251
|
+
def _extract_algorithm_k_value_and_source_features(cls, feature: Feature) -> tuple[str, Union[int, str], list[str]]:
|
|
252
|
+
"""
|
|
253
|
+
Extract algorithm, k_value, and source features from a feature.
|
|
287
254
|
|
|
288
|
-
|
|
289
|
-
k_value_raw = feature.options.get(cls.K_VALUE)
|
|
255
|
+
Tries string-based approach first, falls back to configuration-based.
|
|
290
256
|
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
257
|
+
Args:
|
|
258
|
+
feature: The feature to extract parameters from
|
|
259
|
+
|
|
260
|
+
Returns:
|
|
261
|
+
Tuple of (algorithm, k_value, source_features_list)
|
|
262
|
+
|
|
263
|
+
Raises:
|
|
264
|
+
ValueError: If parameters cannot be extracted
|
|
265
|
+
"""
|
|
266
|
+
source_features = cls._extract_source_features(feature)
|
|
267
|
+
algorithm, k_value = cls._extract_clustering_params(feature)
|
|
296
268
|
|
|
297
|
-
if algorithm is None or k_value is None
|
|
298
|
-
raise ValueError(f"Could not extract algorithm
|
|
269
|
+
if algorithm is None or k_value is None:
|
|
270
|
+
raise ValueError(f"Could not extract algorithm and k_value from: {feature.name}")
|
|
299
271
|
|
|
300
272
|
return algorithm, k_value, source_features
|
|
301
273
|
|
|
@@ -27,14 +27,14 @@ except ImportError:
|
|
|
27
27
|
np = None # type: ignore[assignment]
|
|
28
28
|
|
|
29
29
|
|
|
30
|
-
from
|
|
30
|
+
from mloda import ComputeFramework
|
|
31
31
|
from mloda_plugins.compute_framework.base_implementations.pandas.dataframe import PandasDataFrame
|
|
32
32
|
from mloda_plugins.feature_group.experimental.clustering.base import ClusteringFeatureGroup
|
|
33
33
|
|
|
34
34
|
|
|
35
35
|
class PandasClusteringFeatureGroup(ClusteringFeatureGroup):
|
|
36
36
|
@classmethod
|
|
37
|
-
def compute_framework_rule(cls) -> set[type[
|
|
37
|
+
def compute_framework_rule(cls) -> set[type[ComputeFramework]]:
|
|
38
38
|
"""Define the compute framework for this feature group."""
|
|
39
39
|
return {PandasDataFrame}
|
|
40
40
|
|